# 기본 환경 설정

In [1]:
# Read the Hugging Face token from Colab's user-data store under the name "HF_KEY",
# so the token itself is never written into the notebook.
from google.colab import userdata
HF_KEY = userdata.get("HF_KEY")

In [2]:
import huggingface_hub
# Log this session in to the Hugging Face Hub with the token held in HF_KEY.
huggingface_hub.login(HF_KEY)

# 모델 로딩

In [3]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.8.8-py3-none-any.whl.metadata (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.8.7 (from unsloth)
  Downloading unsloth_zoo-2025.8.7-py3-none-any.whl.metadata (9.4 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.28-py3-none-any.whl.metadata (11 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.19.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Do

In [4]:
from unsloth import FastLanguageModel
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
# Load the "unsloth/gemma-3-4b-it" checkpoint through Unsloth with load_in_4bit enabled.
# The call returns a pair that is unpacked into `model` and `tokenizer`, both of which
# are used by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-3-4b-it",
    load_in_4bit=True
)

==((====))==  Unsloth 2025.8.8: Fast Gemma3 patching. Transformers: 4.55.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/4.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

In [6]:
# Put the loaded model into Unsloth's inference mode before generating.
# NOTE(review): this rebinds `model` to the call's return value — confirm that
# for_inference returns the model in the installed Unsloth version; if it returned
# nothing, every later cell that uses `model` would fail.
model = FastLanguageModel.for_inference(model)

# Prompt 함수

In [7]:
def getPrompt(messages_json):
    """
    Flatten a list of chat messages into a single prompt string.

    Each message must be a mapping with "role" and "content" keys. Messages
    whose role is "system", "user" or "assistant" are rendered as
    "<|role|>\\n{content}</s>\\n"; messages with any other role are skipped.
    The prompt always ends with an open "<|assistant|>\\n" header so the model
    continues with the assistant's turn.

    NOTE(review): the original docstring described this as a Gemma-style
    prompt, but the markers emitted here are "<|role|>" / "</s>". Confirm
    they match what the loaded checkpoint expects (compare with the
    tokenizer's own chat template).

    Args:
        messages_json: iterable of {"role": ..., "content": ...} mappings.

    Returns:
        The assembled prompt string.

    Raises:
        KeyError: if a message lacks "role" or "content".
    """
    rendered_roles = ("system", "user", "assistant")
    segments = []
    for entry in messages_json:
        # Both keys are read up front, so a malformed message fails loudly
        # even when its role would otherwise be skipped.
        speaker, text = entry["role"], entry["content"]
        if speaker in rendered_roles:
            segments.append(f"<|{speaker}|>\n{text}</s>\n")

    # Open the assistant turn that the model is expected to complete.
    segments.append("<|assistant|>\n")
    return "".join(segments)

# 질문 응답 수행

In [None]:
# Example conversation for the step-by-step walkthrough: one system instruction
# followed by one user question.
messages = [
    {"role": "system", "content": "너는 친절한 AI야."},
    {"role": "user", "content": "지구는 왜 둥글까?"},
]

In [None]:
# Flatten the example conversation into a single prompt string.
prompt = getPrompt(messages)

In [None]:
# Tokenize the prompt into PyTorch tensors ("pt") and move them to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

In [None]:
# Generate a continuation for the tokenized prompt. Gradient tracking is disabled
# because this is inference only.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,  # cap on the number of newly generated tokens
        do_sample=True,  # sample from the distribution instead of greedy decoding
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,  # stop once the tokenizer's EOS token is produced
    )

In [None]:
# Decode the first returned sequence back to text, dropping special tokens.
generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Show the full decoded text as this cell's output for inspection.
generated

In [None]:
# Keep only the text after the last "<|assistant|>\n" marker and trim surrounding whitespace.
# If the marker does not occur in `generated`, the whole decoded text is kept.
response = generated.split("<|assistant|>\n")[-1].strip()

In [None]:
# Print the extracted reply.
print(response)

# 질문 응답 함수

In [None]:
def chatQA(messages, temperature=0.7, top_p=0.9):
    """Generate a single assistant reply for a chat history.

    Uses the notebook-level `model`, `tokenizer` and `getPrompt`.

    Args:
        messages: Sequence of {"role": ..., "content": ...} dicts; converted
            to a prompt string by getPrompt.
        temperature: Sampling temperature forwarded to model.generate. None
            or a value <= 0 selects greedy decoding instead, because sampling
            requires a strictly positive temperature.
        top_p: Nucleus-sampling threshold forwarded to model.generate when
            sampling is active.

    Returns:
        The newly generated text (the prompt is excluded), stripped of
        leading and trailing whitespace. At most 512 new tokens are generated.
    """
    prompt = getPrompt(messages)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": 512,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        # Forward the caller's settings; they used to be ignored in favour of
        # hard-coded 0.7 / 0.9 regardless of what was passed in.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling with a non-positive temperature is rejected by generate(),
        # so treat it as a request for deterministic (greedy) decoding.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length instead of splitting the decoded text on a marker string, which
    # would misfire if the model itself emitted that marker.
    prompt_length = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
# Ask a question under a pirate persona, naming the sampling settings explicitly
# so the call site documents what the two numbers mean.
reply = chatQA(
    [
        {"role": "system", "content": "너는 해적의 피를 가진 AI야."},
        {"role": "user", "content": "지구는 왜 둥글까? 해적인 니가 그걸 알까?"},
    ],
    temperature=0.7,
    top_p=0.9,
)
print(reply)