In [1]:
from IPython.display import display, HTML
display(HTML("""
<style>
div.container{width:99% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:0px;}
div.CodeMirror {font-family:Consolas; font-size:24pt;}
div.text_cell_render.rendered_html{font-size:20pt;}
div.text_cell_render ul li, div.text_cell_render ol li p, code{font-size:22pt; line-height:30px;}
div.output {font-size:24pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:24pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:24pt;padding:5px;}
table.dataframe{font-size:24px;}
</style>
"""))

In [8]:
import warnings
import os
import logging
# 경고 제거
warnings.filterwarnings('ignore')

# transformers 로깅 레벨 조정
logging.getLogger("transformers").setLevel(logging.ERROR)

# Hugging Face symlink 경고 제거
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# from transformers import pipeline, logging as hf_logging
# hf_logging.set_verbosity_error()

# <span style="color:red">ch1. 허깅페이스</span>
- Inference API 이용 : 모델의 결과를 surver에서
- pipeline()이용 : 모델을 다운로드받아 모델의 결과를 local에서
    * raw text -> tokenizer -> model -> [0.11, 0.55, 0.xx, ~] logits값으로 prediction 결과 출력
```
허깅페이스 transformers에서 지원하는 task
"sentiment-analysis" : "text-classification"의 별칭(감정분석 전용으로 사용)
"text-classification" : 감정분석, 뉴스분류, 리뷰 분류 등 일반적인 문장 분류
"zero-shot-classification" : 레이블을 학습 없이 주어진 후보군 중에서 분류
"token-classification" : 개체명 인식(NER : Named Entity Recognition) 등 단위 라벨링
"ner" : "token-classification"의 별칭
"fill-mask": 빈칸 채우기 
"text-generation" : 텍스트 생성 (GPT류 모델에 사용)
"text2text-generation" : 번역, 요약 등 입력 -> 출력 변환
"translation" : 번역
"summerization" : 텍스트요약
"question-answering" : 주어진 context를 보고 질문에 답하기.
"image-to-text" : 그림을 설명
"image-classification" : 이미지 분류

```

## 1. 텍스트 기반 감정분석(긍정/부정)
- C:\Users\Admin\.cache\huggingface\hub

In [10]:
from transformers import pipeline
classifier = pipeline(task="sentiment-analysis")
classifier("I've been waiting for a HuggingFace course my whole life.")

[{'label': 'POSITIVE', 'score': 0.9333938956260681}]

In [12]:
classifier = pipeline(task="text-classification",
                     model = "distilbert/distilbert-base-uncased-finetuned-sst-2-english")

# 감정분석시 내용이 많으면 list로
classifier([
    "I've been waiting for a HuggingFace course my whole life."
    "I hate this so much!"
])

[{'label': 'NEGATIVE', 'score': 0.9982663989067078}]

In [13]:
classifier("이 영화 정말 최고였어요. 감동적이고 연기가 대단해")

[{'label': 'POSITIVE', 'score': 0.857815682888031}]

In [14]:
classifier("이 물건 정말 사고 싶어요")

[{'label': 'POSITIVE', 'score': 0.8577604293823242}]

In [18]:
classifier(["I like you", "I hate you", "나 너가 싫어", ""])

[{'label': 'POSITIVE', 'score': 0.9998695850372314},
 {'label': 'NEGATIVE', 'score': 0.9991129040718079},
 {'label': 'NEGATIVE', 'score': 0.599323034286499},
 {'label': 'POSITIVE', 'score': 0.5370621085166931}]

In [22]:
classifier = pipeline(task = "sentiment-analysis",
                     model="matthewburke/korean_sentiment")
texts = ['나는 너가 좋아', "당신이 싫어요", "힘들어요","오늘 기분이 최고야"]

result = classifier(texts)

In [23]:
result

[{'label': 'LABEL_1', 'score': 0.9557897448539734},
 {'label': 'LABEL_0', 'score': 0.9092598557472229},
 {'label': 'LABEL_0', 'score': 0.9140233397483826},
 {'label': 'LABEL_1', 'score': 0.9714491367340088}]

In [29]:
from transformers import pipeline
classifier = pipeline(task="sentiment-analysis",
                     model="matthewburke/korean_sentiment")
classifier(['나는 너가 좋아', "당신이 싫어요", "힘들어요", "오늘 기분이 최고야"])

[{'label': 'LABEL_1', 'score': 0.9557897448539734},
 {'label': 'LABEL_0', 'score': 0.9092598557472229},
 {'label': 'LABEL_0', 'score': 0.9140233397483826},
 {'label': 'LABEL_1', 'score': 0.9714491367340088}]

## 2. 제로샷분류(Zero-shot분류)
- 기계학습 및 자연어처리에서 각 개별 작업에 대한 특정 교육없이 작업을 수행할 수 있는 모형(비지도학습)

In [32]:
classifier = pipeline("zero-shot-classification")

classifier(
    "I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"]
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

{'sequence': 'I have a problem with my iphone that needs to be resolved asap!',
 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'],
 'scores': [0.5227580070495605,
  0.45814019441604614,
  0.0142647260800004,
  0.0026850001886487007,
  0.002152054337784648]}

In [33]:
sequence_to_classify = "One day i will see the world"
candidate_labels = ['travel', 'cooking', 'dancing']
classifier(sequence_to_classify, candidate_labels)

{'sequence': 'One day i will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.9939888715744019, 0.0032361410558223724, 0.002775018336251378]}

# 3. text 생성


In [40]:

from transformers import pipeline, set_seed
# set_seed(2)
generation = pipeline("text-generation", "gpt2") # 텍스트 생성 gpt3부터는 허깅페이스없음
generation(
    "in this course. We will teach you how to",
    pad_token_id=generation.tokenizer.eos_token_id
) # pad_token_id 경고를 없애려고 setting

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

[{'generated_text': 'in this course. We will teach you how to create and sell your own digital products, to build a business where you can sell your own products, and to learn how to grow your business.\n\nWe will teach you how to create and sell your own digital products, to build a business where you can sell your own products, and to learn how to grow your business. We will provide you with the tools you need to become a successful entrepreneur. You will learn the basic principles of how to build a business and how to get it done.\n\nYou will learn the basic principles of how to build a business and how to get it done. You will learn the basics of how to build a business and how to get it done. We will show you how to create a business from the ground up and how you can create a business from the ground up and how you can build a company.\n\nWe will show you how to create a business from the ground up and how you can create a company from the ground up and how you can build a compan

In [45]:
from transformers import pipeline
generation = pipeline("text-generation", "skt/kogpt2-base-v2")
result = generation(
    "이 과정은 다음과 같은 방법을 알려드려요. ",
    pad_token_id = generation.tokenizer.eos_token_id,
    max_new_tokens = 100, # 생성할 최대 길이(생성할 토큰 수)
    num_return_sequences=1, # 생성할 문장 갯수
    do_sample=True,          # 다양할 샘플 사용
    top_k = 50,         # top-k 샘플링(확률 높은 상위 50개 토큰만 사용)
    top_p = 0.95,
    temperature = 1.2,           # 창의성 조절(낮을 수록 보수적)
    no_repeat_ngram_szie = 2 # 반복 방지
)
print(result[0]['generated_text'])

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/513M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

ValueError: The following `model_kwargs` are not used by the model: ['no_repeat_ngram_szie'] (note: typos in the generate arguments will also show up in this list)

## 4. 마스크(빈칸)채우기

In [51]:
unmasker = pipeline(task='fill-mask',
                   model='distilbert/distilroberta-base') # 마스크 채우기
unmasker("I'm going to hospital and meet a <mask>", top_k=2) # top_k 기본값 5

[{'score': 0.19275707006454468,
  'token': 3299,
  'token_str': ' doctor',
  'sequence': "I'm going to hospital and meet a doctor"},
 {'score': 0.06794589757919312,
  'token': 27321,
  'token_str': ' psychiatrist',
  'sequence': "I'm going to hospital and meet a psychiatrist"}]

In [52]:

unmasker("Hello, I'm a <mask> model.")

[{'score': 0.0629730075597763,
  'token': 265,
  'token_str': ' business',
  'sequence': "Hello, I'm a business model."},
 {'score': 0.038101598620414734,
  'token': 18150,
  'token_str': ' freelance',
  'sequence': "Hello, I'm a freelance model."},
 {'score': 0.03764132782816887,
  'token': 774,
  'token_str': ' role',
  'sequence': "Hello, I'm a role model."},
 {'score': 0.037326786667108536,
  'token': 2734,
  'token_str': ' fashion',
  'sequence': "Hello, I'm a fashion model."},
 {'score': 0.026023676618933678,
  'token': 24526,
  'token_str': ' Playboy',
  'sequence': "Hello, I'm a Playboy model."}]

In [53]:

unmasker("안녕하세요? 나는 <mask> 모델이예요.", top_k=3)

[{'score': 0.14130638539791107,
  'token': 35,
  'token_str': ':',
  'sequence': '안녕하세요? 나는: 모델이예요.'},
 {'score': 0.1223798543214798,
  'token': 116,
  'token_str': '?',
  'sequence': '안녕하세요? 나는? 모델이예요.'},
 {'score': 0.08188082277774811,
  'token': 328,
  'token_str': '!',
  'sequence': '안녕하세요? 나는! 모델이예요.'}]

 ## InferenceAPI 사용
 

In [2]:
from dotenv import load_dotenv
import os
load_dotenv()
# os.environ['HF_TOKEN']

True