In [1]:
pip install transformers==4.31.0 peft==0.4.0 trl==0.7.4 accelerate==0.21.0 bitsandbytes datasets


Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl.metadata (21 kB)
Collecting trl==0.7.4
  Downloading trl-0.7.4-py3-none-any.whl.metadata (10 kB)
Collecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl.metadata (17 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.31.0)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers==4.31.0)
  Downloading regex-2025.8.29-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 k

In [2]:
# 1. 환경 설정
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForLanguageModeling)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import load_dataset
import torch




In [3]:
from trl import SFTTrainer, DPOTrainer
from transformers import DataCollatorForLanguageModeling

In [4]:
# 2. Hugging Face 로그인

from huggingface_hub import login
login("---")

In [13]:
# 3. 데이터셋 불러오기
import json
from datasets import Dataset

def load_jsonl_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f]
    return Dataset.from_list(data)

train_dataset = load_jsonl_dataset("train_dataset.jsonl")
eval_dataset = load_jsonl_dataset("test_dataset.jsonl")

In [14]:
# 4. model 로드, tokenizer 설정
model_name = "meta-llama/Llama-2-7b-chat-hf"


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token




config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [15]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 양자화된 모델을 LoRA 훈련에 맞게 준비
model = prepare_model_for_kbit_training(model)

# LoRA 구성 설정
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # LLaMA 계열에서 일반적으로 사용
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# LoRA 모델 적용
model = get_peft_model(model, lora_config)


In [16]:
# 5. Collator 정의
from trl import DataCollatorForCompletionOnlyLM

collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template="[/INST]",  # 응답은 [/INST] 뒤부터 시작
    instruction_template="[INST]"  # (선택 사항이지만 명시하면 더 안전)
)


In [17]:
# 6. training_args 설정
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    save_strategy="no",  # 체크포인트 저장 생략
    logging_steps=5,
    fp16=False,
    bf16=True,
    report_to="none"
)

In [21]:
# 7. SFT Trainer 학습

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_args,
    data_collator=collator,
    packing=False    #datacollatorforcompletiononlyLM은 packing 사용 x
)




Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [22]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.

Support Argilla Cloud offers comprehensive support services to address various issues that may arise during the use of our service.Support levels are categorized into four distinct tiers, based on the severity of the issue, and a separate category for feature requests.The support process, response times, and procedures differ for each category.(1) Critical Issues Critical issues are characterized by: Severe impact on the Service, potentially rendering it completely non-functional.Disruption of critical service operations or functions.Obstruction of entire customer workflows.In the case of a critical issue, Argilla will: Assign specialist(s) to correct the issue on an expedited basis.Provide ongoing communication on the status via email and/or phone, according to

Step,Training Loss
5,2.3481
10,2.081
15,1.5924
20,1.4018
25,1.2859
30,1.3519
35,1.13
40,1.1363
45,1.1864
50,1.3194



Starter: Ideal for teams initiating their journey in scaling data curation and labelling projects.Perfect for environments where production monitoring is not a requirement.Base: Tailored for teams seeking to amplify their data curation, labelling efforts, and model monitoring, with enhanced support from Argilla.Medium: Designed for teams expanding their language model pipelines, requiring robust ML lifecycle management fortified by Argilla's comprehensive support.Large: Geared towards teams heavily dependent on language model pipelines, human feedback, and applications, requiring complete ML lifecycle management with robust support.Scope of services Argilla Cloud, a fully managed SaaS, encompasses the following functionalities: Unrestricted Users, Datasets, and Workspaces: The service imposes no limits on the number of users, datasets, or workspaces, supporting scalability of operations.Role-Based Access Control: Administrators and annotators have differentiated access rights to ensur

TrainOutput(global_step=120, training_loss=1.06548676888148, metrics={'train_runtime': 99.0171, 'train_samples_per_second': 2.424, 'train_steps_per_second': 1.212, 'total_flos': 2634879451742208.0, 'train_loss': 1.06548676888148, 'epoch': 1.0})

In [23]:
# 8.저장 (LoRA adapter + tokenizer)
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

#압축, 다운로드 준비
import shutil
shutil.make_archive("model_2_output", 'zip', "./results")

#다운로드 링크 표시
from IPython.display import FileLink
FileLink("model_2_output.zip")


In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from peft import PeftModel

# # 1. base model 로드 (원래 쓰던 LLaMA 모델)
# base_model = AutoModelForCausalLM.from_pretrained(
#     "meta-llama/Llama-2-7b-chat-hf",
#     device_map="auto",
#     trust_remote_code=True
# )

# # 2. tokenizer 로드
# tokenizer = AutoTokenizer.from_pretrained("model_output")

# # 3. LoRA adapter 붙이기
# model = PeftModel.from_pretrained(base_model, "model_output")


In [15]:
# 9. inference

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel  # ❗ 중요

# base 모델과 tokenizer 로드
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",  # 학습할 때 썼던 base 모델 이름
    device_map="auto",
    torch_dtype=torch.bfloat16,       # 양자화 환경일 경우 맞춰줌
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained("./results")

# LoRA adapter 붙이기
model = PeftModel.from_pretrained(base_model, "./results").to("cuda")
model.eval()




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): 

In [25]:
# 프롬프트 입력
instruction = "토익 공부 계획 짜줘"
prompt = f"<s>### Instruction\n{instruction}\n### Response"

# 토크나이즈
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# 생성
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id
    )

# 출력 디코딩
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)


### Instruction
토익 공부 계획 짜줘
### Response
Here is a study plan for toxicology:

1. Introduction to Toxicology (20 hours)
	* Definition of toxicology and its importance
	* Historical background and development of toxicology
	* Types of toxicology (environmental, occupational, clinical, etc.)
2. Cellular and Molecular Toxicology (40 hours)
	* Cellular and molecular mechanisms of toxicity
	


In [17]:
instruction = "판타지 소설의 첫 문장을 써줘"
prompt = f"<s>### Instruction\n{instruction}\n### Response"

In [18]:
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id
    )

output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)


### Instruction
판타지 소설의 첫 문장을 써줘
### Response
The first sentence of a short story is crucial in setting the tone and grabbing the reader's attention. Here are some effective ways to start a short story:

1. With a hook: Begin with an interesting or provocative statement that draws the reader in and makes them want to know more. For example, "The last time I saw my mother, she was dead."
2. With action: Start with a scene that is full of energy and movement, such as a character running or a dramatic event unfolding. For example, "The storm raged outside, but Sarah couldn't escape the darkness within."
3. With dialogue: Begin with a conversation between characters that reveals important


In [19]:
instruction = "탄소중립이 뭔지 초등학생도 이해할 수 있게 설명해줘"
prompt = f"<s>### Instruction\n{instruction}\n### Response"

In [20]:
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id
    )

output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

### Instruction
탄소중립이 뭔지 초등학생도 이해할 수 있게 설명해줘
### Response
* Sure, I'd be happy to explain it in a way that is easy for elementary school students to understand!

So, you know how we have different types of things in the world, like animals, plants, and even objects like chairs and tables? Well, "instruction" is like a special kind of thing that helps us learn new things. It's like a recipe or a set of steps that tells us how to do something we don't already know how to do.

For example, if you want to make a peanut butter and jelly sandwich, you need an instruction to tell you what to do. First, you need to get bread, then you need to


In [21]:
from ipywidgets import interact

def simple_chat(instruction):
    formatted = f"<s>### Instruction\n{instruction}\n### Response"
    inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7, top_p=0.9)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(response.split("### Response")[-1].strip())

interact(simple_chat, instruction="토익 공부 계획 짜줘")


interactive(children=(Text(value='토익 공부 계획 짜줘', description='instruction'), Output()), _dom_classes=('widget-i…

<function __main__.simple_chat(instruction)>