##### Torch, CUDA, cuDDN, 버전 확인

In [1]:
import torch

# CUDA 사용 가능 여부 확인
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
print("Torch version:{}".format(torch.__version__))
print("cuda version: {}".format(torch.version.cuda))
print("cudnn version:{}".format(torch.backends.cudnn.version()))

Using device: cuda
Torch version:2.3.0+cu121
cuda version: 12.1
cudnn version:8906


##### 필요 라이브러리 설치

In [2]:
#양자화에 필요한 패키지 설치
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install trl
!pip install nltk
!pip install jsonlines
!pip install datasets
from datasets import Dataset

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.t

* accelerate : hugging face 학습루프 가속화 라이브러리
* peft: LoRA, Prefix Tuning, P-Tuing, Prompt Tuning 과 같은 기법들을 쉽게 사용하도록 나온 라이브러리
* bitsandbytes: gpu 에서 모델을 손쉽게 압축할 수 있는 라이브러리
* trl: TRL (Transformer Reinforcement Learning) 은 transfomer언어 모델의 훈련을 위한 풀스택 라이브러리

In [3]:
import pandas as pd
import numpy as np
import random
import re

##### Google Dirve Mount

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##### csv file read

In [5]:
df = pd.read_csv('/content/drive/MyDrive/open/news.csv')

In [6]:
df.head()

Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...


##### title 열과 contents 열을 연결하여 text열 생성

In [7]:
df['text'] = df['title'] + " :" + df['contents']

df['text'].head()

0    Spanish coach facing action in race row :MADRI...
1    Bruce Lee statue for divided city :In Bosnia, ...
2    Only Lovers Left Alive's Tilda Swinton Talks A...
3    Macromedia contributes to eBay Stores :Macrome...
4    Qualcomm plans to phone it in on cellular repa...
Name: text, dtype: object

##### NLTK에서 영어 불용어 제거

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')
all_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')
nltk.download('stopwords')
all_stopwords = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


##### Text Preprocessing 1


In [10]:
def preprocess_text(text):
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 내용과 무관한 태그 제거
    text = re.sub(r'target=\/\S+','' , text)
    text = re.sub(r'&lt\;\S+', '', text)

    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)

    # 멘션 제거
    text = re.sub(r'@\w+', '', text)

    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')

    # 공백 및 특수문자 제거
    text = re.sub(r'\s+', ' ', text).strip()

    # 숫자 제거
    text = re.sub(r'\d+', '', text)

    # 불용어 제거
    text = word_tokenize(text)
    tokens_without_sw = [word for word in text if not word in all_stopwords]
    text = (" ").join(tokens_without_sw)

    return text.lower()

Text Preprocessing 2(추가)

In [11]:
# 단일 문자 제거
def remove_single_char(text, threshold=1):
    words = word_tokenize(text)
    text = ' '.join([word for word in words if len(word) > threshold])
    return text

# 구두점 제거
def remove_punctuation(text):
    return re.sub(r'[^a-zA-Z0-9]', ' ', text)

# 불필요한 공백 제거
def remove_extra_whitespaces(text):
    return re.sub(r'^\s*|\s\s*', ' ', text).strip()

##### 전처리 적용

In [12]:
df['text'] = df['text'].apply(preprocess_text)
df['text'] = df['text'].apply(remove_single_char)
df['text'] = df['text'].apply(remove_punctuation)
df['text'] = df['text'].apply(remove_extra_whitespaces)
df.to_csv('./pre_df.csv')
print(df['text'].head())

0    spanish coach facing action race row madrid af...
1    bruce lee statue divided city in bosnia one ma...
2    only lovers left alive s tilda swinton talks a...
3    macromedia contributes ebay stores macromedia ...
4    qualcomm plans phone cellular repairs over the...
Name: text, dtype: object


###### huggingface hub 로그인

In [13]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

##### csv 파일을 json 파일로 변환

In [14]:
import json
import jsonlines

dataPath = "/content/drive/MyDrive/open/"

# 데이터 경로 설정 및 불러오기
datasetName = "ft_data.csv"
jsonFileName = "tf_data.json"

def csv_to_json(csv_file_path, json_file_path):
    # CSV 파일을 DataFrame으로 읽기
    df = pd.read_csv(csv_file_path)

    # JSON 파일로 저장
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        # 각 행을 JSON으로 변환하여 바로 파일에 쓰기
        for index, row in df.iterrows():
            data = {'text': row['text'], 'label': row['label']}
            json.dump(data, json_file, ensure_ascii=False)
            json_file.write('\n')  # 각 행마다 줄바꿈

# CSV 파일 경로와 JSON 파일 경로 설정
csv_file_path = dataPath + datasetName
json_file_path = dataPath + jsonFileName

# 함수 호출
csv_to_json(csv_file_path, json_file_path)

* 추가 학습 데이터를 ft_data.csv로 생성해놓음(text, label 형태)
* 해당 csv파일을 json 형식으로 변환

##### 데이터 셋 생성

In [15]:
indataset=[]
with jsonlines.open(json_file_path) as f:
  for line in f.iter():
    indataset.append(f'[INST] {line["text"]} [/INST] {line["label"]} ')

# 데이터셋 생성 및 저장
indataset=Dataset.from_dict({"text": indataset})
indataset.save_to_disk(dataPath)

print(indataset)

Saving the dataset (0/1 shards):   0%|          | 0/48 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 48
})


##### 생성된 데이터 셋을 허깅페이스에 로드

In [16]:
indataset.push_to_hub("madcoww/recover_label")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/266 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/madcoww/recover_label/commit/e904d3b3d60fe05e2903c7304f75c93676d4b6c4', commit_message='Upload dataset', commit_description='', oid='e904d3b3d60fe05e2903c7304f75c93676d4b6c4', pr_url=None, pr_revision=None, pr_num=None)

##### 모델 선언

In [17]:
base_model = "meta-llama/Llama-2-7b-hf"

dataset_tf = "madcoww/recover_label"

new_model = "llama-2-7b-news"

##### 데이터셋 로드

In [18]:
from datasets import load_dataset
dataset = load_dataset(dataset_tf, split="train")

Downloading readme:   0%|          | 0.00/266 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/48 [00:00<?, ? examples/s]

In [19]:
dataset

Dataset({
    features: ['text'],
    num_rows: 48
})

##### 양자화 config 설정

In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # bnb_4bit_use_double_quant=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

* load_in_4bit : 모델 로드 시 4비트 양자화를 사용하도록 설정
* bnb_4bit_use_double_quant : 더블 양자화 설정 ex) 8bit -> 4bit
* bnb_4bit_quant_type : 양자화 유형 nf4(normal float 4)
* bnb_4bit_compute_dtype : 4비트 양자화된 데이터를 처리할 때 사용할 데이터 타입(float16, float32)


##### 모델 로드

In [21]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer
from peft import LoraConfig

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0},
    token=True
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

* 사전 학습된 모델을 로드
* quantization_config : 양자화 설정을 지정한 매개변수
* device_map 특성 디바이스 할당(0:GPU)
* model.config.use_cache : 이전 계산 결과를 캐시할지 여부(메모리 절약)
* model.config.pretraining_tp : 1은 텐서 병렬화가 적용되지 않음을 의미 단일 디바이스에서 실행

##### 토크나이저 로드

In [22]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

##### 모델 정보

In [23]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

### In-Context Learning(few-shot Learning) by Prompt

##### Prompt 준비

In [51]:
def create_prompt(input_texts):
  prompts = []
  for input_text in input_texts:
    prompt = f"""
      Instead of generating a new sentence, just look at the example and tell me which category among the six (Business, Entertainment, Politics, Sport, Tech, World) is appropriate for the latest news:

    - [News Content]: "two investment banks settle sec washington reuters deutsche bank securities inc thomas weisel partners agreed pay combined million settle charges involving conflicts interest research investment banking u s regulators said thursday"
    - [Labels]: business

    - [News Content]: "yukos chief plans return to russia the american chief executive yukos embattled oil producer whose top executives left russia week ago feared government prosecution said yesterday quot fully intends"
    - [Labels]: business

    - [News Content]: "oil prices soar supply shortage fears oil prices soared new highs sides atlantic today traders fretted increasingly short supplies the cost crude new york broke"
    - [Labels]: business

    - [News Content]: "report lehman near deal enron lawsuits reuters reuters lehman brothers holdings inc close settling class action lawsuit million stemming allegations colluded other brokerages mislead enron corp shareholders the wall street journal reported Thursday"
    - [Labels]: business

    - [News Content]: "dow jones agrees buy marketwatch million deal dow jones company publisher the wall street journal agreed buy marketwatch parent company financial news web site cbs marketwatch approximately million companies said today"
    - [Labels]: business

    - [News Content]: "sears sales slightly october apparel still slumping hoffman estates ill hoffman estates based sears roebuck company reported slight increase october same store sales today despite continuing weak results apparel"
    - [Labels]: business

    - [News Content]: "weighing outsourcing s impact key factors help determine outsourcing benefits hurts americans"
    - [Labels]: business

    - [News Content]: "adb president tadao chino resigns the president manila based asian development bank tuesday announced resignation multilateral bank effective next year"
    - [Labels]: business

    - [News Content]: "fran mires talks rayhein ala fein egypt s popular reality show diff authors bit skeptical ten years al hurra achieved great deal mena region grand part due presence fran mires program developer television executive network"
    - [Labels]: entertainment

    - [News Content]: "lee create new film superhero comic book veteran stan lee team producer robert evans create movie featuring new superhero foreverman focus character face problems everyday life well using special powers save world paramount pictures studio behind film revealed details project say potential spawn series films lee best known work spider man the incredible hulk he collaborating script screenwriter peter briggs penned recent comic book adaptation hellboy we believe truly whole new franchise said gill champion president chief executive lee s pow entertainment in world people looking something different stan s idea create concept seen become evergreen franchise paramount many lee s creations including x men daredevil turned films past five years however spider man series biggest box office hit original sequel taking almost bn worldwide third spider man film scheduled release another marvel comics adaptation the fantastic four released cinemas summer"
    - [Labels]: entertainment

    - [News Content]: "harry potter ip claim pinned beaches tour company shows warner brothers churchill spirit"
    - [Labels]: entertainment

    - [News Content]: "rogue one dominates the holiday box office authors the star wars film took million christmas eve projected finish holiday monday total million domestically"
    - [Labels]: entertainment

    - [News Content]: "vera drake leads uk oscar hopes mike leigh s film vera drake lead british hopes year s academy awards getting three nominations imelda staunton nominated best actress role abortion drama leigh received nods best director original screenplay kate winslet also nominated best actress category role eternal sunshine spotless mind and clive owen sophie okonedo got nominated supporting roles closer hotel rwanda respectively owen already made bookmakers favourite best supporting actor role closer already clinched golden globe award and first nomination actress okonedo chosen performance hotel rwanda rwandan genocide it also debut nomination staunton told bbc news thought film would appeal academy voters it extraordinary time making film ca n t believe happened morning said hope shows mike extraordinary filmmaker we also dealing difficult subject matter amazing accepted way leigh previously received three oscar nominations secrets lies topsy turvy told bbc news latest success amazing he said we hoped imelda staunton would get nomination never expected get director screenplay it s absolutely wonderful think people aware s life hope warmth compassion really talks people winslet said ecstatic fourth nomination career being nominated means much to nominated film released ago feel honoured overwhelmed said john woodward chief executive uk film council said extremely heartening see british filmmaking talent recognised global stage britain hugely talented industry nominations show national lottery investment film pays major dividends culture economy among total british nominees composer andrew lloyd webber lyricist charles hart best original song learn to be lonely the phantom opera movie cinematographer john mathieson nominated gladiator also the phantom opera and finding neverland garnered two nominations brits gemma jackson also worked bridget jones s diary iris art direction costume designer alexandra byrne whose previous films included captain corelli s mandolin elizabeth running the uk two contenders best live action short film category wasp made ex children s tv presenter andrea arnold little terrorist work ashvin kumar this year s awards handed hollywood february"
    - [Labels]: entertainment

    - [News Content]: "disney takes sides battle next generation dvd hollywood movie powerhouse walt disney taken sides japans sony corp bitter battle studios define technical standard next generation dvds said"
    - [Labels]: entertainment

    - [News Content]: "playing traumas war are games based vietnam conflict making us immune realities history"
    - [Labels]: entertainment

    - [News Content]: "how lifetime s unreal tackles the princess fantasy authors the suitor prize youure maidens trying glass slipper"
    - [Labels]: entertainment

    - [News Content]: "the clinton campaign will fight you on twitter authors now that s win election"
    - [Labels]: politics

    - [News Content]: "did trump collude with russia or obstruct justice probably both authors just follow facts"
    - [Labels]: politics

    - [News Content]: "trump is governing like traditional republican authors he may always sound like one actions line gop s agenda"
    - [Labels]: politics

    - [News Content]: "kerry unfit lead amid great threats bush says reuters reuters president bush labeled sen john kerry saturday political opportunist unfit lead amid great threats america"
    - [Labels]: politics

    - [News Content]: "another bush era staffer endorses hillary clinton authors the list keeps getting longer"
    - [Labels]: politics

    - [News Content]: "lot of americans think abortion is just as bad as attacking abortion clinics authors most americans say colorado planned parenthood shooting act terrorism feel abortion providers better"
    - [Labels]: politics

    - [News Content]: "democrats are n t quite ready for their primary to end authors bernie stay little bit longer"
    - [Labels]: politics

    - [News Content]: "mike pence suggested not torturing terrorism suspects was like using oprah winfrey methods authors this guy s supposed prevent trump bringing back waterboarding"
    - [Labels]: politics

    - [News Content]: "lionel messi says kobe bryant was the reason he got into basketball authors game recognize game"
    - [Labels]: sports

    - [News Content]: "al wrap texas completes sweep oakland new york reuters david dellucci s two run double bottom ninth inning helped texas rangers complete three game sweep oakland athletics win american league arlington thursday"
    - [Labels]: sports

    - [News Content]: "even brazilian icon rivaldo thinks we should stay away from rio authors youull putting life risk here ud"
    - [Labels]: sports

    - [News Content]: "what s that baseball s back n t paid much attention baseball lately scintillating presidential campaign scintillating fall television season britney spears s scintillating latest marriage imagine surprise woke afternoon latest developments"
    - [Labels]: sports

    - [News Content]: "germans secure place next champions trophy lahore two second half goals florian keller helped germany beat new zealand sunday pakistan national stadium city lahore guaranteeing place men field hockey champions trophy next year"
    - [Labels]: sports

    - [News Content]: "double dip new york maybe seem mere whistling bronx pledges manager terry francona general manager theo epstein even boston s loss game red sox would somehow find way overcome possible loss curt schilling rest american league championship series"
    - [Labels]: sports

    - [News Content]: "kick off whistle blows tampa bay leprechaun makes move manchester united directors barely disguised gritted teeth yesterday issued statement outside world awaiting months britain richest football club received possible takeover offer"
    - [Labels]: sports

    - [News Content]: "champions league group roundup including tale it tale two penalties highbury arsenal could manage draw panathinaikos henry converted arsenal spot kick open scoring minutes basinas failed deliver"
    - [Labels]: sports

    - [News Content]: "emc makes smb channel macintosh play with dantz acquisition emc scored triple play acquisition dantz development small developer data backup restore software retrospect brand"
    - [Labels]: tech

    - [News Content]: "star wars peaceful life mars nasa gets the world dreaming if peace earth longer feasible end century may another option move mars yesterday head nasa surface exploration mission said find water red"
    - [Labels]: tech

    - [News Content]: "nasa tries break speed record with last plane test update nasa test latest jet powered aircraft final time next week aiming set world record flying times speed sound"
    - [Labels]: tech

    - [News Content]: "can microsoft stomp itunes with store its own microsoft aiming market apple computer pioneered year ago itunes online music store"
    - [Labels]: tech

    - [News Content]: "gadget market to grow the explosion consumer technology continue delegates world s largest gadget show las vegas told the number gadgets shops predicted grow devices talk become increasingly important everything going digital kirsten pfeifer consumer electronics association told bbc news website the consumer electronics show ces featured pick s products consumers controlling want technologies like hdtvs high definition tvs digital radio digital cameras remain strong all products show really showed breadth depth industry despite showing diversity delegates attending complained showcase lacked much wow factor previous years the portable technologies show also reflected one buzzwords ces time place shifting multimedia content able watch listen video music anywhere time at start last year s ces cea predicted would average growth that figure surpassed rise popularity portable digital music players personal video recorders digital cameras it clear also gadgets becoming lot lifestyle choice fashion personalisation becoming increasingly key way gadgets designed part rise spending power generation x ers grown technology spending power desire devices suit more consumer electronics market made female buyers according cea research hybrid devices combine number multimedia functions also evidence show floor lot driven ability said stephen baker consumer electronics analyst retail research firm npd group some functions cost next nothing add as well show floor showcasing everything tiny wearable mp players giant high definition tvs several keynote speeches made industry leaders microsoft chief bill gates despite several embarrassing technical glitches mr gate s pre show speech announced several new partnerships mainly us market he unveiled new ways letting people take tv shows recorded personal video recorders watch back portable devices he disappointed however failing announce details next generation xbox games console another disappointment lack exposure sony s new portable games device psp show sony said much anticipated gadget would likely start shipping march us europe it went sale japan christmas there two psps embedded glass cabinets show though representatives discuss details sony representative told bbc news website sony consider part consumer technology offering elsewhere show plethora colour plasma screens including samsung s inch metre plasma largest world industry experts also excited high definition technologies coming fore new formats dvds coming hold six times much data conventional dvds with many devices move lot products show offering external storage like seagate s gb pocket sized external hard drive innovation engineering design prize more trade professionals attended ces las vegas officially ran january"
    - [Labels]: tech

    - [News Content]: "intel appoints otellini ceo intel world largest computer chip maker announced thurs day directors confirmed selection company president paul otellini chief executive"
    - [Labels]: tech

    - [News Content]: "hm wonder what mark zuckerberg us up to on facebook right now authors we re watching mark"
    - [Labels]: tech

    - [News Content]: "microsoft msn desktop search is out in beta microsoft msn desktop search is out in beta as expected microsoft finally released beta desktop search software it integrated browser toolbar set labeled msn toolbar suite beta the download size huge compared google desktop search megs would"
    - [Labels]: tech

    - [News Content]: "israeli fire kills girl gaza palestinian girl reported killed israeli fire hours mortars aimed jewish enclave hurt four"
    - [Labels]: world

    - [News Content]: "roundup abuja talks winding road peace darfur lagos sept xinhuanetby dai adi lin xiaochun the road sustainable peace war ravaged darfur region western sudan might long tortuous one on going african union au sponsored"
    - [Labels]: world

    - [News Content]: "new brussels blow turkey eu hopes eu farm commissioner franz fischler friday became latest brussels critic raise doubts turkey hopes joining bloc wrangling ankara eu bid heats"
    - [Labels]: world

    - [News Content]: "hobbit find raises debate humanity sydney the discovery skeleton female barely one metre tall hunted pygmy elephants giant rats years ago could force reassessment origins humanity scientists australia said yesterday"
    - [Labels]: world

    - [News Content]: "climate researchers toast vintners grape harvest records while oenophiles may pore grape harvest records search perfect vintage wine team french scientists historians raising glasses toast insight records yield past climate"
    - [Labels]: world

    - [News Content]: "this week guantanamo detainees arraigned guantanamo bay naval base cuba the first guantanamo detainees arraigned terrorism charges alleged al qaeda accountant poet accused crafting terrorist propaganda man accused driver osama bin laden"
    - [Labels]: world

    - [News Content]: "blair was warned post war iraq chaos paper london reuters britain s foreign secretary senior officials warned prime minister tony blair year invading iraq chaos could follow toppling saddam hussein newspaper said saturday"
    - [Labels]: world

    - [News Content]: "dutch raid kurdish training camps arrest amsterdam reuters dutch authorities raided suspected training camp kurdistan workers party pkk guerrilla group southern netherlands arrested people prosecutors said friday"
    - [Labels]: world

    - [News Content]: "{input_text}"
     - [Labels]:
    """
    prompts.append(prompt)
  return prompts

input_texts = [
  "israel kills palestinians big gaza incursion reuters israeli forces killed three palestinians including two teenagers wednesday after storming northern gaza strip third time as many months quell palestinian rocket fire israel",
  "john kerry confident the iran deal will survive past obama plots his next big task authors the secretary state boldly predicts deal percent approval rating",
  "spain u s match expected set record ap ap this week s u s spain davis cup final expected break attendance record sanctioned tennis match",
  "cisco unveils new routers business chicago reuters cisco systems inc href biggest maker network gear directing internet traffic monday said introduce new line low end routers coming weeks aimed winning corporate business"
]

prompts = create_prompt(input_texts)

* 예시 데이터 외의 다른 4가지 데이터(레이블이 다른)를 가지고 4가지의 Prompt 생성

In [52]:
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto", max_new_tokens=150)

generated_text_list = []

for prompt in input_texts:
    result = pipe(prompt)
    generated_text = result[0]['generated_text']
    generated_text_list.append(generated_text)

In [53]:
generated_text_list

["israel kills palestinians big gaza incursion reuters israeli forces killed three palestinians including two teenagers wednesday after storming northern gaza strip third time as many months quell palestinian rocket fire israel's army said. Unterscheidung israel palestine conflict the israel palestine conflict also known as the arab israeli conflict or the middle east conflict is a ongoing and mostly armed conflict between nationalistic jews and arabs in the middle east the conflict began in the middle east during the late 19th century and has continued to the present day.",
 "john kerry confident the iran deal will survive past obama plots his next big task authors the secretary state boldly predicts deal percent approval rating will survive even if president obama is voted out of office. everybody is back with me now. we're going to talk about this deal. we're going to talk about the fact that the president is still on the campaign trail and he's still not talking about the deal. but

* 생성된 데이터를 확인해보면 입력한 데이터와 추가적인 텍스트를 생성하는 것을 보임
* 하지만 레이블(카테고리)만을 원하기에 Prompt방식 및 Propmt 양식이 잘못되었다고 생각

### Fine-tuning (데이터 재학습)

##### PETF 설정

In [54]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

* lora_alpha: LoRA의 scaling factor, 로우랭크 행렬에 적용되는 스케일링 값
* lora_dropout: LoRA의 드롭아웃 비율
* r: 로우랭크 행렬의 랭크. 이 값은 LoRA의 압축 비율을 결정
* bias: bias를 사용할지 여부를 결정합니다.
* ask_type: 모델의 작업의 유형입니다."CAUSAL_LM" (Causal Language Modeling)

##### TrainingArguments 설정

In [56]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    # bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

* output_dir: 학습 결과가 저장될 디렉토리 (모델 체크포인트와 로그)
* num_train_epochs: 학습 에포크 수
* per_device_train_batch_size: 학습 시 배치 크기
* gradient_accumulation_steps: 그래디언트를 누적할 단계 수 (1보다 크면 그래디언트가 누적되어 더 큰 배치 크기로 학습하는 효과)
* optim: 옵티마이저 유형 "paged_adamw_32bit"
* save_steps: 모델 체크포인트를 저장할 단계 수
* logging_steps: 로그를 기록할 단계 수
* learning_rate: 학습률 (가중치를 업데이트하는 속도 결정)
* weight_decay: 가중치 감소 값 모델이 과적합되는 것을 방지하기 위해 가중치에 적용되는 정규화 값
* fp16: 16비트 부동소수점 연산을 사용할지 여부를 결정 (True :학습 속도가 빨라지고 메모리 사용량이 줄어듬)
* max_grad_norm: 그래디언트 클리핑 값 그래디언트 폭발을 방지하기 위해 그래디언트를 이 값으로 클리핑
* max_steps: 학습할 최대 단계 수, -1로 설정하면 모든 에포크를 완료할 때까지 학습
* warmup_ratio: 학습률 워밍업 비율 학습 초기에 학습률을 점진적으로 증가시키기 위한 비율
* group_by_length: 시퀀스 길이에 따라 배치를 그룹화할지 여부, True로 설정하면 시퀀스 길이에 따라 배치가 그룹화
* lr_scheduler_type: 학습률 스케줄러 유형 "constant"로 설정되어 학습률이 일정하게 유지
* report_to: 학습 로그를 보고할 플랫폼 여기서는 TensorBoard에 로그를 기록

##### SFTTrainer 설정

In [57]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


* train_dataset: 학습에 사용할 데이터셋
* peft_config: PEFT (Parameter-Efficient Fine-Tuning) 설정(peft_params 객체로 설정)
* dataset_text_field: 데이터셋에서 텍스트 필드의 이름
* max_seq_length: 시퀀스의 최대 길이
* tokenizer: 토크나이저입니다.
* args: 학습 설정 TrainingArguments 객체로 설정
* packing: 시퀀스 패킹 여부, 시퀀스 길이를 고정하거나 패딩하는 등의 작업을 의미

##### 모델 학습

In [58]:
trainer.train()

Step,Training Loss
25,4.0866
50,3.175
75,2.6156
100,2.7205
125,2.3146
150,1.8823
175,1.3722
200,1.5273
225,1.02
250,0.9421


TrainOutput(global_step=960, training_loss=0.7453132179876168, metrics={'train_runtime': 975.4333, 'train_samples_per_second': 0.984, 'train_steps_per_second': 0.984, 'total_flos': 2680000816250880.0, 'train_loss': 0.7453132179876168, 'epoch': 20.0})

##### 모델 테스트

In [63]:
logging.set_verbosity(logging.CRITICAL)

prompts = ["israel kills palestinians big gaza incursion reuters reuters israeli forces killed three palestinians including two teenagers wednesday after storming northern gaza strip third time as many months quell palestinian rocket fire israel",
            "john kerry confident the iran deal will survive past obama plots his next big task authors the secretary state boldly predicts deal percent approval rating",
            "spain u s match expected set record ap ap this week s u s spain davis cup final expected break attendance record sanctioned tennis match",
            "cisco unveils new routers business chicago reuters cisco systems inc href biggest maker network gear directing internet traffic monday said introduce new line low end routers coming weeks aimed winning corporate business",]

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=150)

generated_text_list2 = []
labels = []

for prompt in prompts:
    result = pipe(f"[INST] {prompt} [/INST]")
    generated_text = result[0]['generated_text']
    label = generated_text.split("[/INST]")[1].strip().split()[0]

    generated_text_list2.append(generated_text)
    labels.append(label)

for label in labels:
  print(label)

world
politics
sports
tech


* result 객체에서 [/INST] 다음 단어로 사전 학습된 6가지 label을 출력
* 이후 다른 text 까지 생성하기때문에 원하는 결과만 출력

##### 생성된 데이터

In [64]:
generated_text_list2

['[INST] israel kills palestinians big gaza incursion reuters reuters israeli forces killed three palestinians including two teenagers wednesday after storming northern gaza strip third time as many months quell palestinian rocket fire israel [/INST] world 08 05 2006 12 40 am et israeli forces killed three palestinians including two teenagers wednesday after storming northern gaza strip for third time many months quell palestinian rocket fire israel said [/INST] world 08 05 2006 12 40 am et israeli forces killed three palestinians including two teenagers wednesday after storming northern gaza strip for third time many months quell palestinian rocket fire israel said hours earlier [/INST] world 08 05 2006 12 ',
 '[INST] john kerry confident the iran deal will survive past obama plots his next big task authors the secretary state boldly predicts deal percent approval rating [/INST] politics 2016 presidential campaign democrats dream team hobbit toad hall [/INST] politics 2016 presidentia

##### 모델 허깅페이스 업로드

In [66]:
# Repository 생성 & model upload
REPO_NAME = "llama-2-news"
AUTH_TOKEN = "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"

model.push_to_hub(
    REPO_NAME,
    use_temp_dir=True,
    token=AUTH_TOKEN
)
tokenizer.push_to_hub(
    REPO_NAME,
    use_temp_dir=True,
    token=AUTH_TOKEN
)
# 모델을 저장할 경로 설정
MODEL_PATH = "/content/drive/MyDrive/model/"

# 모델 저장
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

('/content/drive/MyDrive/model/tokenizer_config.json',
 '/content/drive/MyDrive/model/special_tokens_map.json',
 '/content/drive/MyDrive/model/tokenizer.model',
 '/content/drive/MyDrive/model/added_tokens.json',
 '/content/drive/MyDrive/model/tokenizer.json')