In [None]:
%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
%pip install --no-deps xformers[cuda] "trl<0.9.0" peft accelerate bitsandbytes

: 

In [1]:
import os
current_directory = os.getcwd()
print(current_directory)

/home/kwb425/projects/lina/track3/llama3


In [None]:
# Kill all processes running on GPU
sh = """
pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader,nounits)

# Kill each process
for pid in $pids; do
    sudo kill -9 $pid
done
"""
with open('script.sh', 'w') as file:
    file.write(sh)

!bash script.sh
!nvidia-smi

: 

* We support Llama, Mistral, Phi-3, Gemma, Yi, DeepSeek, Qwen, TinyLlama, Vicuna, Open Hermes etc
* We support 16bit LoRA or 4bit QLoRA. Both 2x faster.
* `max_seq_length` can be set to anything, since we do automatic RoPE Scaling via [kaiokendev's](https://kaiokendev.github.io/til) method.
* With [PR 26037](https://github.com/huggingface/transformers/pull/26037), we support downloading 4bit models **4x faster**! [Our repo](https://huggingface.co/unsloth) has Llama, Mistral 4bit models.
* [**NEW**] We make Phi-3 Medium / Mini **2x faster**! See our [Phi-3 Medium notebook](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing)

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/Meta-Llama-3.1-70B-bnb-4bit", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.536 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

### Test Inference before gradio

In [None]:
from unsloth.chat_templates import get_chat_template

# added, otherwise, error
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)
###############

messages = [
    {"from": "human", "value": "7대 고액암에 대해 설명해줘!"},
    {"from": "gpt", "value": "보험약관에서 정한 암 중 백혈병, 뇌암, 골수암, 식도암, 담낭암, 담도암, 췌장암을 말합니다."},
    {"from": "human", "value": "유방암은 포함 안 돼?"}
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

In [None]:
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128, use_cache = True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>user<|end_header_id|>

7대 고액암에 대해 설명해줘!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

보험약관에서 정한 암 중 백혈병, 뇌암, 골수암, 식도암, 담낭암, 담도암, 췌장암을 말합니다.<|eot_id|><|start_header_id|>user<|end_header_id|>

유방암은 포함 안 돼?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

유방암은 女성암 중 한 가지입니다. 보험약관에서 정한 여성암은 유방암, 생식암(자궁내막암, 자궁근종, 자궁neck암, 난소암, 생식관암), 기타피부암을 말합니다. <|eot_id|>


You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

### Gradio Demo

In [None]:
import gradio as gr
import torch
from threading import Thread
from transformers import TextIteratorStreamer
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

def predict(message, history):

    history_transformer_format = history + [[message, ""]]

    messages = [{"from": "system", "value": "You are a helpful AI assistant. Please follow the user's request kindly."}]
    for item in history_transformer_format:
        messages = messages + [{"from": "human", "value": item[0]}]
        messages = messages + [{"from": "gpt", "value": item[1]}]

    messages[-1]['value'] = ""
    model_inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    text_streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        input_ids = model_inputs,
        streamer = text_streamer,
        max_new_tokens = 10000,
        use_cache = True,
        pad_token_id=tokenizer.eos_token_id,
        )

    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in text_streamer:
        if new_token != '<':
            partial_message += new_token
            yield partial_message

gr.ChatInterface(predict).launch(share=True, debug=True)

Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://9319372a66bb20bb07.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Exception in thread Thread-15 (_fast_generate):
Traceback (most recent call last):
  File "/home/hosungnam/Applications/miniconda3/envs/1/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/hosungnam/Applications/miniconda3/envs/1/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 761, in run_closure
    _threading_Thread_run(self)
  File "/home/hosungnam/Applications/miniconda3/envs/1/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/home/hosungnam/Applications/miniconda3/envs/1/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/hosungnam/Applications/miniconda3/envs/1/lib/python3.10/site-packages/unsloth/models/llama.py", line 1194, in _fast_generate
    output = generate(*args, **kwargs)
  File "/home/hosungnam/Applications/miniconda3/envs/1/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 1