##### Torch, CUDA, cuDDN, 버전 확인

In [1]:
import torch

# CUDA 사용 가능 여부 확인
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
print("Torch version:{}".format(torch.__version__))
print("cuda version: {}".format(torch.version.cuda))
print("cudnn version:{}".format(torch.backends.cudnn.version()))

Using device: cuda
Torch version:2.3.0+cu121
cuda version: 12.1
cudnn version:8906


##### 필요 라이브러리 설치

In [2]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

!pip install nltk

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.

* accelerate : hugging face 학습루프 가속화 라이브러리
* peft: LoRA, Prefix Tuning, P-Tuing, Prompt Tuning 과 같은 기법들을 쉽게 사용하도록 나온 라이브러리
* bitsandbytes: gpu 에서 모델을 손쉽게 압축할 수 있는 라이브러리
* trl: TRL (Transformer Reinforcement Learning) 은 transfomer언어 모델의 훈련을 위한 풀스택 라이브러리

In [3]:
import pandas as pd
import numpy as np
import random
import re

##### Google Dirve Mount

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = pd.read_csv('/content/drive/MyDrive/open/news.csv')

##### title 열과 contents 열을 연결하여 text열 생성

In [6]:
df['text'] = df['title'] + " :" + df['contents']

df['text'].head()

0    Spanish coach facing action in race row :MADRI...
1    Bruce Lee statue for divided city :In Bosnia, ...
2    Only Lovers Left Alive's Tilda Swinton Talks A...
3    Macromedia contributes to eBay Stores :Macrome...
4    Qualcomm plans to phone it in on cellular repa...
Name: text, dtype: object

##### NLTK에서 영어 불용어 제거

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')
nltk.download('stopwords')
all_stopwords = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


##### Text Preprocessing 1


In [8]:
def preprocess_text(text):
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 내용과 무관한 태그 제거
    text = re.sub(r'target=\/\S+','' , text)
    text = re.sub(r'&lt\;\S+', '', text)

    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)

    # 멘션 제거
    text = re.sub(r'@\w+', '', text)

    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')

    # 공백 및 특수문자 제거
    text = re.sub(r'\s+', ' ', text).strip()

    # 숫자 제거
    text = re.sub(r'\d+', '', text)

    # 불용어 제거
    text = word_tokenize(text)
    tokens_without_sw = [word for word in text if not word in all_stopwords]
    text = (" ").join(tokens_without_sw)

    return text.lower()

Text Preprocessing 2(추가)

In [9]:
# 단일 문자 제거
def remove_single_char(text, threshold=1):
    words = word_tokenize(text)
    text = ' '.join([word for word in words if len(word) > threshold])
    return text

# 구두점 제거
def remove_punctuation(text):
    return re.sub(r'[^a-zA-Z0-9]', ' ', text)

# 불필요한 공백 제거
def remove_extra_whitespaces(text):
    return re.sub(r'^\s*|\s\s*', ' ', text).strip()

##### 전처리 적용

In [10]:
df['text'] = df['text'].apply(preprocess_text)
df['text'] = df['text'].apply(remove_single_char)
df['text'] = df['text'].apply(remove_punctuation)
df['text'] = df['text'].apply(remove_extra_whitespaces)
df.to_csv('./pre_df.csv')
print(df['text'].head())

0    spanish coach facing action race row madrid af...
1    bruce lee statue divided city in bosnia one ma...
2    only lovers left alive s tilda swinton talks a...
3    macromedia contributes ebay stores macromedia ...
4    qualcomm plans phone cellular repairs over the...
Name: text, dtype: object


###### huggingface hub 로그인

In [11]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(base_model,
                                          token=True,)

model = AutoModelForCausalLM.from_pretrained(base_model,
                                             device_map={"": 0},
                                             torch_dtype=torch.float16,
                                             token=True,
                                             load_in_8bit=True,
                                            #  load_in_4bit=True
                                             )

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [58]:
from transformers import pipeline

pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                do_sample=True,
                top_p=0, # top_p=0으로 설정한다면 확률분포 중 가장 높은 확률의 단어만 선택
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )

* top_p : 소프트맥스를 통해 예측한 확률을 정렬(내림차순)한 후, 누적확률이 p 이하인 토큰만 선택하는 방식
* top_k : 소프트맥스를 통해 예측한 확률을 정렬(내림차순)한 후 상위 k개의 토큰만 선택하는 방식
* top_p, top_k -> **낮은 확률 값을 가진 토큰을 효과적으로 제거**

In [59]:
!nvidia-smi

Fri Jun 14 06:24:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   62C    P0              30W /  72W |  16731MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### In-Context Learning(few-shot Learning) by Prompt

##### Prompt 준비

In [60]:
ins = f"""
  For example:
  - [News]: "two investment banks settle sec washington reuters deutsche bank securities inc thomas weisel partners agreed pay combined million settle charges involving conflicts interest research investment banking u s regulators said thursday"
  - [Category]: "business"

  - [News]: "yukos chief plans return to russia the american chief executive yukos embattled oil producer whose top executives left russia week ago feared government prosecution said yesterday quot fully intends"
  - [Category]: "business

  - [News]: "oil prices soar supply shortage fears oil prices soared new highs sides atlantic today traders fretted increasingly short supplies the cost crude new york broke"
  - [Category]: "business

  - [News]: "report lehman near deal enron lawsuits reuters reuters lehman brothers holdings inc close settling class action lawsuit million stemming allegations colluded other brokerages mislead enron corp shareholders the wall street journal reported Thursday"
  - [Category]: "business

  - [News]: "dow jones agrees buy marketwatch million deal dow jones company publisher the wall street journal agreed buy marketwatch parent company financial news web site cbs marketwatch approximately million companies said today"
  - [Category]: "business

  - [News]: "sears sales slightly october apparel still slumping hoffman estates ill hoffman estates based sears roebuck company reported slight increase october same store sales today despite continuing weak results apparel"
  - [Category]: "business

  - [News]: "weighing outsourcing s impact key factors help determine outsourcing benefits hurts americans"
  - [Category]: "business

  - [News]: "adb president tadao chino resigns the president manila based asian development bank tuesday announced resignation multilateral bank effective next year"
  - [Category]: "business

  - [News]: "fran mires talks rayhein ala fein egypt s popular reality show diff authors bit skeptical ten years al hurra achieved great deal mena region grand part due presence fran mires program developer television executive network"
  - [Category]: "entertainment"

  - [News]: "lee create new film superhero comic book veteran stan lee team producer robert evans create movie featuring new superhero foreverman focus character face problems everyday life well using special powers save world paramount pictures studio behind film revealed details project say potential spawn series films lee best known work spider man the incredible hulk he collaborating script screenwriter peter briggs penned recent comic book adaptation hellboy we believe truly whole new franchise said gill champion president chief executive lee s pow entertainment in world people looking something different stan s idea create concept seen become evergreen franchise paramount many lee s creations including x men daredevil turned films past five years however spider man series biggest box office hit original sequel taking almost bn worldwide third spider man film scheduled release another marvel comics adaptation the fantastic four released cinemas summer"
  - [Category]: "entertainment"

  - [News]: "harry potter ip claim pinned beaches tour company shows warner brothers churchill spirit"
  - [Category]: "entertainment"

  - [News]: "rogue one dominates the holiday box office authors the star wars film took million christmas eve projected finish holiday monday total million domestically"
  - [Category]: "entertainment"

  - [News]: "vera drake leads uk oscar hopes mike leigh s film vera drake lead british hopes year s academy awards getting three nominations imelda staunton nominated best actress role abortion drama leigh received nods best director original screenplay kate winslet also nominated best actress category role eternal sunshine spotless mind and clive owen sophie okonedo got nominated supporting roles closer hotel rwanda respectively owen already made bookmakers favourite best supporting actor role closer already clinched golden globe award and first nomination actress okonedo chosen performance hotel rwanda rwandan genocide it also debut nomination staunton told bbc news thought film would appeal academy voters it extraordinary time making film ca n t believe happened morning said hope shows mike extraordinary filmmaker we also dealing difficult subject matter amazing accepted way leigh previously received three oscar nominations secrets lies topsy turvy told bbc news latest success amazing he said we hoped imelda staunton would get nomination never expected get director screenplay it s absolutely wonderful think people aware s life hope warmth compassion really talks people winslet said ecstatic fourth nomination career being nominated means much to nominated film released ago feel honoured overwhelmed said john woodward chief executive uk film council said extremely heartening see british filmmaking talent recognised global stage britain hugely talented industry nominations show national lottery investment film pays major dividends culture economy among total british nominees composer andrew lloyd webber lyricist charles hart best original song learn to be lonely the phantom opera movie cinematographer john mathieson nominated gladiator also the phantom opera and finding neverland garnered two nominations brits gemma jackson also worked bridget jones s diary iris art direction costume designer alexandra byrne whose previous films included captain corelli s mandolin elizabeth running the uk two contenders best live action short film category wasp made ex children s tv presenter andrea arnold little terrorist work ashvin kumar this year s awards handed hollywood february"
  - [Category]: "entertainment"

  - [News]: "disney takes sides battle next generation dvd hollywood movie powerhouse walt disney taken sides japans sony corp bitter battle studios define technical standard next generation dvds said"
  - [Category]: "entertainment"

  - [News]: "playing traumas war are games based vietnam conflict making us immune realities history"
  - [Category]: "entertainment"

  - [News]: "how lifetime s unreal tackles the princess fantasy authors the suitor prize youure maidens trying glass slipper"
  - [Category]: "entertainment"

  - [News]: "the clinton campaign will fight you on twitter authors now that s win election"
  - [Category]: "politics"

  - [News]: "did trump collude with russia or obstruct justice probably both authors just follow facts"
  - [Category]: "politics"

  - [News]: "trump is governing like traditional republican authors he may always sound like one actions line gop s agenda"
  - [Category]: "politics"

  - [News]: "kerry unfit lead amid great threats bush says reuters reuters president bush labeled sen john kerry saturday political opportunist unfit lead amid great threats america"
  - [Category]: "politics"

  - [News]: "another bush era staffer endorses hillary clinton authors the list keeps getting longer"
  - [Category]: "politics"

  - [News]: "lot of americans think abortion is just as bad as attacking abortion clinics authors most americans say colorado planned parenthood shooting act terrorism feel abortion providers better"
  - [Category]: "politics"

  - [News]: "democrats are n t quite ready for their primary to end authors bernie stay little bit longer"
  - [Category]: "politics"

  - [News]: "mike pence suggested not torturing terrorism suspects was like using oprah winfrey methods authors this guy s supposed prevent trump bringing back waterboarding"
  - [Category]: "politics"

  - [News]: "lionel messi says kobe bryant was the reason he got into basketball authors game recognize game"
  - [Category]: "sports"

  - [News]: "al wrap texas completes sweep oakland new york reuters david dellucci s two run double bottom ninth inning helped texas rangers complete three game sweep oakland athletics win american league arlington thursday"
  - [Category]: "sports"

  - [News]: "even brazilian icon rivaldo thinks we should stay away from rio authors youull putting life risk here ud"
  - [Category]: "sports"

  - [News]: "what s that baseball s back n t paid much attention baseball lately scintillating presidential campaign scintillating fall television season britney spears s scintillating latest marriage imagine surprise woke afternoon latest developments"
  - [Category]: "sports"

  - [News]: "germans secure place next champions trophy lahore two second half goals florian keller helped germany beat new zealand sunday pakistan national stadium city lahore guaranteeing place men field hockey champions trophy next year"
  - [Category]: "sports"

  - [News]: "double dip new york maybe seem mere whistling bronx pledges manager terry francona general manager theo epstein even boston s loss game red sox would somehow find way overcome possible loss curt schilling rest american league championship series"
  - [Category]: "sports"

  - [News]: "kick off whistle blows tampa bay leprechaun makes move manchester united directors barely disguised gritted teeth yesterday issued statement outside world awaiting months britain richest football club received possible takeover offer"
  - [Category]: "sports"

  - [News]: "champions league group roundup including tale it tale two penalties highbury arsenal could manage draw panathinaikos henry converted arsenal spot kick open scoring minutes basinas failed deliver"
  - [Category]: "sports"

  - [News]: "emc makes smb channel macintosh play with dantz acquisition emc scored triple play acquisition dantz development small developer data backup restore software retrospect brand"
  - [Category]: "tech"

  - [News]: "star wars peaceful life mars nasa gets the world dreaming if peace earth longer feasible end century may another option move mars yesterday head nasa surface exploration mission said find water red"
  - [Category]: "tech"

  - [News]: "nasa tries break speed record with last plane test update nasa test latest jet powered aircraft final time next week aiming set world record flying times speed sound"
  - [Category]: "tech"

  - [News]: "can microsoft stomp itunes with store its own microsoft aiming market apple computer pioneered year ago itunes online music store"
  - [Category]: "tech"

  - [News]: "gadget market to grow the explosion consumer technology continue delegates world s largest gadget show las vegas told the number gadgets shops predicted grow devices talk become increasingly important everything going digital kirsten pfeifer consumer electronics association told bbc news website the consumer electronics show ces featured pick s products consumers controlling want technologies like hdtvs high definition tvs digital radio digital cameras remain strong all products show really showed breadth depth industry despite showing diversity delegates attending complained showcase lacked much wow factor previous years the portable technologies show also reflected one buzzwords ces time place shifting multimedia content able watch listen video music anywhere time at start last year s ces cea predicted would average growth that figure surpassed rise popularity portable digital music players personal video recorders digital cameras it clear also gadgets becoming lot lifestyle choice fashion personalisation becoming increasingly key way gadgets designed part rise spending power generation x ers grown technology spending power desire devices suit more consumer electronics market made female buyers according cea research hybrid devices combine number multimedia functions also evidence show floor lot driven ability said stephen baker consumer electronics analyst retail research firm npd group some functions cost next nothing add as well show floor showcasing everything tiny wearable mp players giant high definition tvs several keynote speeches made industry leaders microsoft chief bill gates despite several embarrassing technical glitches mr gate s pre show speech announced several new partnerships mainly us market he unveiled new ways letting people take tv shows recorded personal video recorders watch back portable devices he disappointed however failing announce details next generation xbox games console another disappointment lack exposure sony s new portable games device psp show sony said much anticipated gadget would likely start shipping march us europe it went sale japan christmas there two psps embedded glass cabinets show though representatives discuss details sony representative told bbc news website sony consider part consumer technology offering elsewhere show plethora colour plasma screens including samsung s inch metre plasma largest world industry experts also excited high definition technologies coming fore new formats dvds coming hold six times much data conventional dvds with many devices move lot products show offering external storage like seagate s gb pocket sized external hard drive innovation engineering design prize more trade professionals attended ces las vegas officially ran january"
  - [Category]: "tech"

  - [News]: "intel appoints otellini ceo intel world largest computer chip maker announced thurs day directors confirmed selection company president paul otellini chief executive"
  - [Category]: "tech"

  - [News]: "hm wonder what mark zuckerberg us up to on facebook right now authors we re watching mark"
  - [Category]: "tech"

  - [News]: "microsoft msn desktop search is out in beta microsoft msn desktop search is out in beta as expected microsoft finally released beta desktop search software it integrated browser toolbar set labeled msn toolbar suite beta the download size huge compared google desktop search megs would"
  - [Category]: "tech"

  - [News]: "israeli fire kills girl gaza palestinian girl reported killed israeli fire hours mortars aimed jewish enclave hurt four"
  - [LabeCategoryls]: "world"

  - [News]: "roundup abuja talks winding road peace darfur lagos sept xinhuanetby dai adi lin xiaochun the road sustainable peace war ravaged darfur region western sudan might long tortuous one on going african union au sponsored"
  - [Category]: "world"

  - [News]: "new brussels blow turkey eu hopes eu farm commissioner franz fischler friday became latest brussels critic raise doubts turkey hopes joining bloc wrangling ankara eu bid heats"
  - [Category]: "world"

  - [News]: "hobbit find raises debate humanity sydney the discovery skeleton female barely one metre tall hunted pygmy elephants giant rats years ago could force reassessment origins humanity scientists australia said yesterday"
  - [Category]: "world"

  - [News]: "climate researchers toast vintners grape harvest records while oenophiles may pore grape harvest records search perfect vintage wine team french scientists historians raising glasses toast insight records yield past climate"
  - [Category]: "world"

  - [News]: "this week guantanamo detainees arraigned guantanamo bay naval base cuba the first guantanamo detainees arraigned terrorism charges alleged al qaeda accountant poet accused crafting terrorist propaganda man accused driver osama bin laden"
  - [Category]: "world"

  - [News]: "blair was warned post war iraq chaos paper london reuters britain s foreign secretary senior officials warned prime minister tony blair year invading iraq chaos could follow toppling saddam hussein newspaper said saturday"
  - [Category]: "world"

  - [News]: "dutch raid kurdish training camps arrest amsterdam reuters dutch authorities raided suspected training camp kurdistan workers party pkk guerrilla group southern netherlands arrested people prosecutors said friday"
  - [Category]: "world"

  Given the following news, provide the appropriate category:

  - [News]: "john kerry confident the iran deal will survive past obama plots his next big task authors the secretary state boldly predicts deal percent approval rating"
  - [Category]:
  """

#  "john kerry confident the iran deal will survive past obama plots his next big task authors the secretary state boldly predicts deal percent approval rating"
#   "spain u s match expected set record ap ap this week s u s spain davis cup final expected break attendance record sanctioned tennis match"
#   "cisco unveils new routers business chicago reuters cisco systems inc href biggest maker network gear directing internet traffic monday said introduce new line low end routers coming weeks aimed winning corporate business"

In [61]:
import json
import textwrap

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

DEFAULT_SYSTEM_PROMPT = """
You are an assistant that categorizes news articles. Your answers should only be in the categories of Business, Entertainment, Politics, Sport, Tech, and World."""

SYSTEM_PROMPT = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS

def get_prompt(instruction):
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

def cut_off_text(text, prompt):
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text

def remove_substring(string, substring):
    return string.replace(substring, "")

def generate(text):
  prompt = get_prompt(text)
  with torch.autocast('cuda', dtype=torch.bfloat16):
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
    outputs = model.generate(**inputs,
                              max_new_tokens=512,
                              eos_token_id=tokenizer.eos_token_id,
                              pad_token_id=tokenizer.eos_token_id,
                              )

    final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    final_outputs = cut_off_text(final_outputs, '</s>')
    final_outputs = remove_substring(final_outputs, prompt)

  return final_outputs

def parse_text(text):
  wrapped_text = textwrap.fill(text, width=100)
  print(wrapped_text + "\n\n")

```</s>``` 텍스트 시퀀스의 끝을 나타내는 특별한 토큰(모델이 생성한 출력의 끝을 구분)

In [62]:
%%time
generated_text = generate(ins)
parse_text(generated_text)

  Based on the given news, the appropriate category is "Politics".


CPU times: user 6.81 s, sys: 9.21 ms, total: 6.82 s
Wall time: 6.8 s
