## Chapter 01. Language Model

### Language Model

#### 1. Vector

In [1]:
import numpy as np
a = np.array([1, 2, 3]) # 리스트 형태로 벡터 생성, 파이썬의 리스트를 numpy의 array(배열)로 변환
a # numpy 내부적으로 벡터나 행렬을 효율적으로 처리하기 위해 만든 자료 구조

array([1, 2, 3])

In [2]:
b = np.array([5, 6, 7])

In [3]:
a * b # 단순 곱 = 배열 위치에 따른 짝지은 곱

array([ 5, 12, 21])

In [4]:
np.dot(a, b) # dot product, 곱한 수의 합계

38

In [5]:
a @ b # 새로운 연산자(dot product)

38

In [6]:
np.einsum('i,i', a, b) # einstein sum

38

#### 2. Matrix

In [7]:
c = np.array([[1, 2, 3], [4, 5, 6]]) # 리스트 형태로 i번째 행을 표현, 하나의 행을 리스트로 표현
c

array([[1, 2, 3],
       [4, 5, 6]])

In [8]:
d = np.array([[1, 2], [3, 4], [5, 6]]) # c가 3열이므로, d는 무조건 3행이 와야함
d

array([[1, 2],
       [3, 4],
       [5, 6]])

In [9]:
np.dot(c, d)

array([[22, 28],
       [49, 64]])

In [10]:
c @ d

array([[22, 28],
       [49, 64]])

In [11]:
c[0, :] # 첫 번째 행

array([1, 2, 3])

In [12]:
d[:, 0] # 첫 번째 열

array([1, 3, 5])

In [13]:
c[0, :] @ d[:, 0] # 첫 번째 행과 첫 번째 열의 곱, 1k와 k1의 곱

22

In [14]:
np.einsum('ik,kj->ij', c, d)

array([[22, 28],
       [49, 64]])

### Transformers

In [15]:
# !pip install transformers

In [16]:
from transformers import pipeline # 특정한 종류의 자연어 처리 작업을 간단히 할 수 있도록 지원
classifier = pipeline('sentiment-analysis') 
# 트랜스포머 모형을 이용한 감성분석, 하고자하는 과제의 종류를 파이프라인에 넣어 과제에 맞는 학습된 모형을 다운받음

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [17]:
classifier('I am glad to hear that you finally made it.') #감성분석: 긍정

[{'label': 'POSITIVE', 'score': 0.9998080134391785}]

In [18]:
classifier('I am sorry that you hate it.') #감성분석: 부정

[{'label': 'NEGATIVE', 'score': 0.9981151819229126}]

In [19]:
classifier = pipeline('sentiment-analysis', model="monologg/koelectra-small-finetuned-nsmc") # 한국어 가능 모델 적용
# 한국어 영화평을 기준으로 감성분석이 실행되어 있는 모델 

In [20]:
classifier('이 영화 진짜 재밌다')

[{'label': 'positive', 'score': 0.9803304076194763}]

In [21]:
classifier('이야기가 말이 안된다')

[{'label': 'negative', 'score': 0.8447047472000122}]

### Tokenizers

In [22]:
# !pip install tokenizers

In [23]:
# !wget -c https://github.com/songys/Chatbot_data/raw/master/ChatbotData%20.csv -O chatbot.csv

In [24]:
import pandas as pd
df = pd.read_csv('./data/chatbot.csv')
df.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [25]:
with open('./data/sample.txt','w',encoding='utf-8') as f: # sample.txt 라는 이름으로 저장
  for row in df.itertuples():
    f.write(row.Q + '\n')
    f.write(row.A + '\n')

In [26]:
# !head sample.txt

In [27]:
from tokenizers import CharBPETokenizer # 글자 단위의 BPE 토큰화
bpe = CharBPETokenizer(lowercase=True) # 대문자를 소문자로 바꾸는 옵션

In [28]:
# 어휘를 학습하는 함수 
# 파일이 여러 개라면 리스트 형태로 제시 
# min_frequency = 최소 몇 번 나와야 어휘에 포함
# vocab_size = 몇 개의 어휘를 학습할 것인가
bpe.train(files='./data/sample.txt',min_frequency=1, vocab_size=5000) 

In [29]:
enc = bpe.encode('자연어 처리는 재밌다!')
enc.ids #텍스트를 토큰화하여 고유번호로 변환

[2206, 1021, 797, 1875, 2251, 1038, 1540]

In [30]:
enc.tokens #고유번호의 글자 확인

['자연', '어</w>', '처', '리는</w>', '재밌', '다</w>', '!</w>']

In [31]:
import unicodedata
with open('decomposed.txt','w',encoding='utf-8') as f:
  for row in df.itertuples():
    q = unicodedata.normalize('NFD', row.Q)
    f.write(q + '\n')
    a = unicodedata.normalize('NFD', row.A)
    f.write(a + '\n')

In [32]:
# !head decomposed.txt 
# 윈도우 메모장에선 풀어쓰기 되어있음

In [33]:
bpe = CharBPETokenizer(lowercase=True)
bpe.train(files='decomposed.txt',min_frequency=1,vocab_size=5000)

In [34]:
text = unicodedata.normalize('NFD','자연어 처리는 재밌다!') # 풀어쓰기로 학습시켰으므로, 토큰화 시킬 때도 풀어쓰기해줘야함
enc = bpe.encode(text)

In [35]:
enc.ids

[1290, 204, 299, 1547, 819, 1365, 162]

In [36]:
enc.tokens

['자연', '어</w>', '처', '리는</w>', '재미', 'ᆻ다</w>', '!</w>']

In [37]:
from tokenizers import ByteLevelBPETokenizer #바이트단위 토큰화
byte = ByteLevelBPETokenizer(lowercase=True)
byte.train(files='./data/sample.txt', min_frequency=1, vocab_size=5000)

In [38]:
enc = byte.encode('자연어 처리는 재밌다!')
enc.ids

[2500, 273, 3488, 99, 1056, 2276, 294, 0]

In [39]:
enc.tokens # 한글은 여러 개의 바이트로 되어있기 때문에 바이트 단위로 쪼개지면서 한글이 아닌 것처럼 보임

['ìŀĲìĹ°', 'ìĸ´', 'Ġì²ĺë', '¦', '¬ëĬĶ', 'Ġìŀ¬ë°Į', 'ëĭ¤', '!']

In [40]:
from tokenizers import BertWordPieceTokenizer # wordpiece model
wp = BertWordPieceTokenizer(lowercase=True)

In [41]:
wp.train(files='./data/sample.txt',min_frequency=1,vocab_size=5000)
enc = wp.encode('자연어 처리는 재밌다!')

In [42]:
enc.ids

[1379, 201, 1014, 1028, 3349, 216, 5]

In [43]:
enc.tokens # ## <- 앞에 말에 이어서 쓰라는 의미, BPE는 끝나는 지점을 표시하고, WordPiece는 이어지는 부분을 표시

['자연', '##어', '처', '##리는', '재밌', '##다', '!']

In [44]:
from tokenizers import SentencePieceBPETokenizer
sp = SentencePieceBPETokenizer() # SentencePiece는 대문자를 소문자로 변화하는 옵션이 없음
sp.train('./data/sample.txt',min_frequency=1,vocab_size=5000)

In [45]:
enc= sp.encode('자연어 처리는 재밌다!')
enc.ids

[1888, 617, 1543, 1397, 1944, 225, 3]

In [46]:
enc.tokens

['▁자연', '어', '▁처', '리는', '▁재밌', '다', '!']

### Softmax Func

In [47]:
import tensorflow as tf

tf.nn.softmax([-1.0, 0.0, 1.0])
# softmax 함수는 여러 개의 로짓을 입력 받아, 확률로 바꿔주는 함수이다. 로짓은 실수형으로 주어져야 함
# 로짓이 상대적으로 크면 확률이 높다

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.09003057, 0.24472848, 0.66524094], dtype=float32)>

In [48]:
tf.exp([-1.0, 0.0, 1.0]) / tf.reduce_sum(tf.exp([-1.0, 0.0, 1.0]))
# softmax 함수는 지수함수를 바탕으로 둠, 지수함수는 -입력값을 받아도 +출력값을 반환하며 값의 대소에 영향을 받는다
# softmax 함수의 계산식 (reudce_sum = 총합)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.09003058, 0.24472848, 0.66524094], dtype=float32)>

In [49]:
tf.nn.sigmoid(2.0)
# sigmoid는 하나의 값만 가지며, 로짓을 입력받아 확률을 반환한다
# softmax에서 특수한 이진분류 케이스일 때 사용한다

<tf.Tensor: shape=(), dtype=float32, numpy=0.8807971>

In [50]:
tf.nn.softmax([0.0, 2.0]) 
# softmax는 한 쪽에 0.0으로 고정시키고, 한 쪽만 변환시키는 값과 sigmoid와 동일

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.11920291, 0.880797  ], dtype=float32)>

### Transformer Pipeline

#### 언어 모형으로 다음 단어 예측

In [51]:
import tensorflow as tf
from transformers import TFAutoModelForCausalLM, AutoTokenizer
# TFAutoModelForCausalLM: LM, 언어 모형, 앞의 맥락을 통해 뒤의 내용을 예측하는 모형
# AutoTkenizer: 미리 학습된 토크나이저를 불러올 때 사용하는 class

In [52]:
# 모형 다운로드, 사전학습된 tokenizer 호출
# 가장 작은 base 모델 사용, 크기가 클수록 용량도 크지만 성능은 좋아짐
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
model = TFAutoModelForCausalLM.from_pretrained("xlnet-base-cased")

All model checkpoint layers were used when initializing TFXLNetLMHeadModel.

All the layers of TFXLNetLMHeadModel were initialized from the model checkpoint at xlnet-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetLMHeadModel for predictions without further training.


In [53]:
# 호출한 tokenizer의 어휘 수
tokenizer.vocab_size

32000

In [54]:
# 호출한 tokenizer의 단어 목록
vocab = tokenizer.get_vocab()
id2word = {i: word for word, i in vocab.items()} # 번호를 key로 하는 dictionary 형성, 번호를 통해 검색 가능

In [55]:
vocab # 영어로 학습된 모델, 토큰 : 토큰 번호
# vocab.items, 토큰과 토큰 번호를 짝을 지어줌 

{'tail': 9424,
 'valu': 18425,
 '▁quarterfinal': 23636,
 '▁carved': 11410,
 '▁Tre': 5738,
 '▁outlines': 20041,
 '▁populace': 28555,
 '▁settlement': 2525,
 '▁truck': 2801,
 '▁Apr': 21057,
 '▁Barr': 9353,
 'utter': 9035,
 '▁Consulting': 22758,
 'bli': 17704,
 '▁Jesuit': 21263,
 '▁Infrastructure': 23729,
 '▁boomers': 26990,
 '▁brewery': 27120,
 '▁Lim': 13947,
 '▁Look': 6504,
 '▁separatist': 10685,
 '▁even': 176,
 '▁certificates': 14561,
 '▁clothing': 4688,
 '▁be': 39,
 '▁stunt': 14838,
 'Americans': 15052,
 '▁Secondary': 17239,
 'goal': 19464,
 '▁Stefan': 10918,
 '▁competence': 21073,
 '▁resembling': 24248,
 '▁ambient': 26566,
 'negative': 25976,
 '▁Bart': 10543,
 '▁refrain': 14333,
 '▁Tele': 9690,
 '▁magnitude': 10270,
 '▁ousted': 10787,
 '▁decorating': 23653,
 'equity': 18265,
 'concern': 29339,
 '▁EDI': 30700,
 '▁jurist': 30908,
 '▁excellent': 2712,
 'cran': 16354,
 '▁reject': 8006,
 'ker': 2261,
 '▁key': 792,
 '▁Avalanche': 27749,
 '▁Roof': 29072,
 '▁feminist': 16458,
 '▁wildly': 1685

In [56]:
id2word

{9424: 'tail',
 18425: 'valu',
 23636: '▁quarterfinal',
 11410: '▁carved',
 5738: '▁Tre',
 20041: '▁outlines',
 28555: '▁populace',
 2525: '▁settlement',
 2801: '▁truck',
 21057: '▁Apr',
 9353: '▁Barr',
 9035: 'utter',
 22758: '▁Consulting',
 17704: 'bli',
 21263: '▁Jesuit',
 23729: '▁Infrastructure',
 26990: '▁boomers',
 27120: '▁brewery',
 13947: '▁Lim',
 6504: '▁Look',
 10685: '▁separatist',
 176: '▁even',
 14561: '▁certificates',
 4688: '▁clothing',
 39: '▁be',
 14838: '▁stunt',
 15052: 'Americans',
 17239: '▁Secondary',
 19464: 'goal',
 10918: '▁Stefan',
 21073: '▁competence',
 24248: '▁resembling',
 26566: '▁ambient',
 25976: 'negative',
 10543: '▁Bart',
 14333: '▁refrain',
 9690: '▁Tele',
 10270: '▁magnitude',
 10787: '▁ousted',
 23653: '▁decorating',
 18265: 'equity',
 29339: 'concern',
 30700: '▁EDI',
 30908: '▁jurist',
 2712: '▁excellent',
 16354: 'cran',
 8006: '▁reject',
 2261: 'ker',
 792: '▁key',
 27749: '▁Avalanche',
 29072: '▁Roof',
 16458: '▁feminist',
 16856: '▁wildly

In [57]:
# 문장 만들기
sequence = f"Once upon a time, there was "

In [58]:
# 토큰화
input_ids = tokenizer.encode(sequence, return_tensors="tf") 
#앞에선 tokenizer를 했지만 지금은 encode로 문장을 토큰화하고, 토큰의 번호를 넘겨줌
#return_tensors 옵션을 통해 tensorflow에서 활용하는 tensor 형태로 데이터 반환환

In [59]:
# 토큰 아이디
input_ids

<tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[1977,  975,   24,   92,   19,  105,   30,    4,    3]])>

In [60]:
# 토큰화된 문장을 모형에 입력
result = model(input_ids)

In [61]:
# 로짓, result의 첫 번째 값
# logits를 softmax 함수에 입력하면 각각의 32000개의 단어에 대한 확률이 나옴
logits = result[0]

In [62]:
logits.shape # 1: 문장 하나 넣어서 출력 하나, 9: 토큰 아홉 개와 그에 대한 확률

TensorShape([1, 9, 32000])

In [63]:
# 마지막 단어 뒤에 나올 토큰의 확률
next_token_logits = logits[:, -1, :]

In [64]:
next_token_logits

<tf.Tensor: shape=(1, 32000), dtype=float32, numpy=
array([[-14.362142, -31.215897, -30.87506 , ..., -23.4436  , -22.54225 ,
        -30.336617]], dtype=float32)>

In [65]:
# softmax 함수에 넣으면 확률형태로 나옴
tf.nn.softmax(next_token_logits)

<tf.Tensor: shape=(1, 32000), dtype=float32, numpy=
array([[1.5699778e-05, 7.5231596e-13, 1.0578500e-12, ..., 1.7859362e-09,
        4.3986286e-09, 1.8124531e-12]], dtype=float32)>

In [66]:
top = tf.math.top_k(next_token_logits, k=10) # k개의 가장 큰값을 도출

In [67]:
for i in top.indices[0].numpy().tolist():
    #단어 번호만 출력 #tensor 형식에서 numpy array 형식으로 바꾸고 다시 list 형태로 변환
    print(id2word[i])

▁or
▁
,
d
.
▁ever
▁a
▁just
▁was
▁and


#### 마스크 언어 모형으로 문장 중간의 빈칸 예측

In [68]:
# 마스크 언어 모형 파이프라인 초기화
from transformers import pipeline
# 모형 다운로드
nlp = pipeline("fill-mask")

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
# 마스크 토큰 = 문장의 빈칸을 나타내는 토큰
nlp.tokenizer.mask_token

'<mask>'

In [70]:
# 파이프라인에 문장 입력
nlp(f"Pizza is my {nlp.tokenizer.mask_token} food") 
# <mask>로 작성해도 됨 
# 빈 칸에 들어갈 토큰을 알려줌

[{'score': 0.41411080956459045,
  'token': 2674,
  'token_str': ' favorite',
  'sequence': 'Pizza is my favorite food'},
 {'score': 0.31080371141433716,
  'token': 5863,
  'token_str': ' comfort',
  'sequence': 'Pizza is my comfort food'},
 {'score': 0.13814868032932281,
  'token': 5548,
  'token_str': ' favourite',
  'sequence': 'Pizza is my favourite food'},
 {'score': 0.015286203473806381,
  'token': 3366,
  'token_str': ' dream',
  'sequence': 'Pizza is my dream food'},
 {'score': 0.013527827337384224,
  'token': 6543,
  'token_str': ' signature',
  'sequence': 'Pizza is my signature food'}]

In [71]:
# 빈 칸(마스크)가 2개 이상인 경우 직접 모형을 만들어야 한다 (pipeline 사용 X)
from transformers import TFAutoModelForMaskedLM, AutoTokenizer

In [72]:
# 모형 다운로드
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base")

All PyTorch model weights were used when initializing TFRobertaForMaskedLM.

All the weights of TFRobertaForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForMaskedLM for predictions without further training.


In [73]:
# 토크나이저의 어휘 목록 호출 -> 단어-번호의 형태에서 번호-단어의 형태로 변환
vocab = tokenizer.get_vocab()
id2word = {i: word for word, i in vocab.items()}

In [74]:
# 문장 입력
sequence = f"Pizza is my {tokenizer.mask_token} food." # f-string

In [75]:
# 토큰화, 모델에 넣기 전 인코딩
input_ids = tokenizer.encode(sequence, return_tensors="tf")

In [76]:
# 빈 칸의 위치 찾기
mask_token_indices = tf.where(input_ids[0] == tokenizer.mask_token_id)[0].numpy().tolist()
# tokenizer.mask_token_id = 빈 칸을 나타내는 토큰 번호
# tensor.flow의 where함수를 통해 빈 칸의 위치를 찾음

In [77]:
# 모형에 입력
result = model(input_ids)

In [78]:
# 로짓
logits = result[0]

In [79]:
# 빈 칸 위치
i = mask_token_indices[0]
i

5

In [80]:
# 빈 칸의 로짓, 9개 토큰 중에 5번째 토큰에 들어갈 로짓값
mask_token_logits = logits[0, i, :]

In [81]:
# 가장 로짓이 높은 토큰 10개
top = tf.math.top_k(mask_token_logits, k=10)

In [82]:
# 출력
for i in top.indices.numpy().tolist():
    print(id2word[i])

Ġfavorite
Ġcomfort
Ġfavourite
Ġsignature
Ġdream
Ġpreferred
Ġpassion
Ġstaple
Ġbreakfast
Ġeveryday


#### 빈칸이 2개인 경우

In [83]:
sequence = f"Pizza {tokenizer.mask_token} my {tokenizer.mask_token} food."

In [84]:
input_ids = tokenizer.encode(sequence, return_tensors="tf")

In [85]:
mask_token_indices = tf.where(input_ids[0] == tokenizer.mask_token_id)
mask_token_indices = tf.squeeze(mask_token_indices).numpy().tolist() #squeeze 함수를 통해 indices의 형태를 간략하게 바꿈
mask_token_indices

[3, 5]

In [86]:
result = model(input_ids)
logits = result[0]
for i in mask_token_indices:
    print(f'=== {i} ===')
    mask_token_logits = logits[0, i, :]
    top = tf.math.top_k(mask_token_logits, k=10)
    for i in top.indices.numpy().tolist():
        print(id2word[i])

=== 3 ===
Ġis
Ġwas
Ġas
Ġbecomes
Ġequals
Ġfor
Ġdelivers
Ġmakes
Ġand
Ġwith
=== 5 ===
Ġfavorite
Ġcomfort
Ġfavourite
Ġjunk
Ġbreakfast
Ġlunch
Ġown
Ġsnack
Ġfried
Ġpreferred


#### 기타 파이프라인

In [87]:
# tokenizers의 pipeline의 여러 가지 기능
# 질문 답변
qa = pipeline("question-answering") #모델 형성

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [88]:
# 지문
context = """
Seoul, officially the Seoul Special City, is the capital and largest metropolis 
of South Korea. Seoul has a population of 9.7 million people, and forms 
the heart of the Seoul Capital Area with the surrounding Incheon metropolis and 
Gyeonggi province.
"""

In [89]:
qa(question="where is the capital city of South Korea?", context=context)

{'score': 0.862982451915741, 'start': 1, 'end': 6, 'answer': 'Seoul'}

In [90]:
qa(question="How many people live in Seoul?", context=context)

{'score': 0.9299284219741821,
 'start': 124,
 'end': 135,
 'answer': '9.7 million'}

In [91]:
# 고유명사 인식
ner = pipeline("ner")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [92]:
ner(context)

[{'entity': 'I-LOC',
  'score': 0.9995671,
  'index': 1,
  'word': 'Seoul',
  'start': 1,
  'end': 6},
 {'entity': 'I-LOC',
  'score': 0.9981558,
  'index': 5,
  'word': 'Seoul',
  'start': 23,
  'end': 28},
 {'entity': 'I-LOC',
  'score': 0.97723013,
  'index': 6,
  'word': 'Special',
  'start': 29,
  'end': 36},
 {'entity': 'I-LOC',
  'score': 0.98950726,
  'index': 7,
  'word': 'City',
  'start': 37,
  'end': 41},
 {'entity': 'I-LOC',
  'score': 0.9969215,
  'index': 17,
  'word': 'South',
  'start': 85,
  'end': 90},
 {'entity': 'I-LOC',
  'score': 0.9992411,
  'index': 18,
  'word': 'Korea',
  'start': 91,
  'end': 96},
 {'entity': 'I-LOC',
  'score': 0.999453,
  'index': 20,
  'word': 'Seoul',
  'start': 98,
  'end': 103},
 {'entity': 'I-LOC',
  'score': 0.98211783,
  'index': 37,
  'word': 'Seoul',
  'start': 172,
  'end': 177},
 {'entity': 'I-LOC',
  'score': 0.9603081,
  'index': 38,
  'word': 'Capital',
  'start': 178,
  'end': 185},
 {'entity': 'I-LOC',
  'score': 0.937354,


In [93]:
# 요약, 모델이 처리할 수 있는 길이에 제약이 있어서 너무 길거나, 짧으면 안 됌, 150~1000 tokens
summ = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [94]:
text = """
A zettelkasten consists of many individual notes with ideas and other short
pieces of information that are taken down as they occur or are acquired. The
notes are numbered hierarchically, so that new notes may be inserted at the
appropriate place, and contain metadata to allow the note-taker to associate
notes with each other. For example, notes may contain tags that describe key
aspects of the note, and they may reference other notes. The numbering,
metadata, format and structure of the notes is subject to variation depending on
the specific method employed. Creating and using a zettelkasten is made easier
by taking the notes down digitally and using appropriate knowledge management
software. But it can be and has long been done on paper using index cards. The
method not only allows a researcher to store and retrieve information related to
their research, but also intends to enhance creativity. Cross-referencing notes
through tags allows the researcher to perceive connections and relationships
between individual items of information that may not be apparent in isolation.
These emergent aspects of the method make the zettelkasten somewhat similar to a
neural network with which one may "converse"
"""

In [95]:
summ(text)

[{'summary_text': ' A zettelkasten consists of many individual notes with ideas that are taken down as they occur or are acquired . The numbering, metadata, format and structure of the notes is subject to variation depending on the specific method employed . Cross-referencing notes with tags allows the researcher to perceive connections and relationships between notes that may not be apparent in isolation . The method is made easier by taking the notes down digitally and using knowledge management software .'}]

In [96]:
# zero-short classification = 전이학습 시, 데이터가 전혀 필요하지 않은 것 분류
# 미리 다양한 종류의 분류를 할 수 있도록 학습되어 있어, 분류할 때 추가 학습 없이 분류 이름을 이용해 텍스트 분류
zs = pipeline('zero-shot-classification')

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [97]:
#문장
sequence = 'Pizza is my favorite food'

In [98]:
#문장을 label에 분류, label에 분류하기 위해 따로 학습이 필요 없음
label = ['food', 'ocean', 'space']

In [99]:
zs(sequence, label)

{'sequence': 'Pizza is my favorite food',
 'labels': ['food', 'space', 'ocean'],
 'scores': [0.9971833825111389, 0.0015179921174421906, 0.0012986757792532444]}

### Decoding

#### Transformers 결정론적 디코딩

In [100]:
import tensorflow as tf
from transformers import TFAutoModelForCausalLM, AutoTokenizer

# 모형 로딩
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = TFAutoModelForCausalLM.from_pretrained('gpt2')

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [101]:
# 문장의 시작 설정, 시작에 이어지는 문장 생성
input_ids = tokenizer.encode('I like this movie', return_tensors='tf')

In [102]:
# 탐욕 탐색
# 문장의 시작을 모델에 입력, 뒷 순서를 확률에 따라 예측; max_length를 통해 최대 길이 지정(현재 50 단어), 지정 안해줄 시 문장이 끝날 때까지
result = model.generate(input_ids, max_length=50)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [103]:
tokenizer.decode(result[0])
# 같은 단어의 반복이 현 디코딩 방법론의 문제점

"I like this movie. I like the way it's set up. I like the way it's set up. I like the way it's set up. I like the way it's set up. I like the way it's set up."

In [104]:
# 빔 탐색
# num_beams를 통해 빔의 개수 지정, early_stopping을 통해 후보군 모두가 문장이 끝났다면, 전체 길이에 도달하지 못해도 중단
result = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [105]:
tokenizer.decode(result[0])

"I like this movie. It's a lot of fun to watch. It's a lot of fun to watch. It's a lot of fun to watch. It's a lot of fun to watch. It's a lot of fun to watch."

In [106]:
# 2-gram이 반복되는 것을 억제
# 연속된 두 개의 표현이 반복되지 못하도록 후보군에서 강제로 삭제, 너무 size를 작게 잡으면 꼭 필요한 표현을 사용하지 못할 수도 있음
result = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True, no_repeat_ngram_size=2)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [107]:
tokenizer.decode(result[0])

'I like this movie, but I don\'t think it\'s going to be as good as I thought it would be," he said.\n\n"I\'m not sure if it will be the best movie I\'ve ever seen. It\'s not going'

In [108]:
print(tokenizer.decode(result[0]))

I like this movie, but I don't think it's going to be as good as I thought it would be," he said.

"I'm not sure if it will be the best movie I've ever seen. It's not going


#### Transfomers 확률적 디코딩

In [109]:
import tensorflow as tf
from transformers import TFAutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = TFAutoModelForCausalLM.from_pretrained('gpt2')

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [110]:
input_ids = tokenizer.encode('I like this movie', return_tensors='tf')

In [111]:
# 무작위 추출
# 실행할 때마다 달라지는 결과를 고정하기 위해 set_seed 를 통해 시드값 고정
# do_sample = True를 통해, 랜덤하게 추출하며, top_k = 0 을 통해 모든 후보 고려
tf.random.set_seed(0)
result = model.generate(input_ids, max_length=50, do_sample=True, top_k=0)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [112]:
tokenizer.decode(result[0])

"I like this movie and it's perfect. As happy as they are along with Billy Smith and Mark Ruffalo it still brings a lot of satisfaction to me. The boda is terribly wrong and I'm fine with that. The vamp two seems"

In [113]:
# 온도 조절
# temperature 를 통해 온도 조절 (기본 온도 1.0), 상대적으로 매끄러운 문장 생성
tf.random.set_seed(0)
result = model.generate(input_ids, max_length=50, do_sample=True, temperature=0.7, top_k=0)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [114]:
tokenizer.decode(result[0])

"I like this movie. The house is so cold that it looks like we're eating out. The shower is very cold and the wind is blowing. The best part, though, is that it's like you're in the middle of the movie."

In [115]:
# top-k
# 온도 설정 대신 top-k 설정
tf.random.set_seed(0)
result = model.generate(input_ids, max_length=50, do_sample=True, top_k=50)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [116]:
print(tokenizer.decode(result[0]))

I like this movie the most. This is like trying to figure out why your girlfriend did this to her, what she wanted, who she wasn't. I don't think it goes into all the details of what was done; the film does have


In [117]:
# top-p
# top-k 대신 top-p 설정
tf.random.set_seed(0)
result = model.generate(input_ids, max_length=50, do_sample=True, top_p=0.9, top_k=0)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [118]:
print(tokenizer.decode(result[0]))

I like this movie to have that alternative but this movie just brings it down in scale and drags down the plot at times. The music can be kind of unique, but I prefer to give the movie more credit to myself and the overall score for


##### Transformers 깁스 추출

In [119]:
import random
import tensorflow as tf
from transformers import TFAutoModelForMaskedLM, AutoTokenizer
from transformers import tf_top_k_top_p_filtering

In [120]:
# 모형 로딩
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
model = TFAutoModelForMaskedLM.from_pretrained('distilroberta-base')

All PyTorch model weights were used when initializing TFRobertaForMaskedLM.

All the weights of TFRobertaForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForMaskedLM for predictions without further training.


In [121]:
# 마스크 토큰
tokenizer.mask_token

'<mask>'

In [122]:
# 초기화
input_ids = tokenizer.encode('I like this movie <mask><mask><mask><mask><mask><mask>')

In [123]:
tokenizer.decode(input_ids) #앞 뒤의 <s>는 문장의 시작과 끝을 나타내는 기호; 문장 길이에 더해짐

'<s>I like this movie<mask><mask><mask><mask><mask><mask></s>'

In [124]:
# 길이 측정
SEQ_LEN = len(input_ids)
SEQ_LEN

12

In [125]:
# 무작위로 위치 하나를 골라 마스킹
i = random.randint(1, SEQ_LEN - 2) # 일정 범위 정수 랜덤 생성; 0번과 12번의 토큰을 제외(시작과 끝 기호 토큰)
input_ids[i] = tokenizer.mask_token_id # 랜덤하게 빈 칸을 만들고 예측 진행 준비

In [126]:
# 문장을 모형에 입력; 모델에 입력하기 위해 리스트를 tensor 형태로 변경
result = model(tf.convert_to_tensor([input_ids])) # 한 번에 모델에 여러 문장을 넣을 수 있기 때문에 리스트 형태로 감싸서 넣어줘야 함

In [127]:
# 로짓 확인
logits = result[0]

In [128]:
logits.shape # 1개의 문장, 12개의 토큰, 50265개의 토큰 확률

TensorShape([1, 12, 50265])

In [129]:
# 앞에서 마스킹한 위치의 로짓을 선택
logits = logits[:, i, :]

In [130]:
logits.shape # 한 개의 토큰에 대한 50265개의 확률

TensorShape([1, 50265])

In [131]:
# 로짓에 따라 무작위로 토큰을 고름
token_id = tf.random.categorical(logits, num_samples=1) # 로짓을 소프트맥스에 통과시켜 확률로 바꿔 랜덤하게 50265개 중 하나를 골라줌; num_samples = 1 한 개를 고름

In [132]:
# 고른 토큰의 번호를 마스킹된 위치에 대입
input_ids[i] = token_id.numpy()[0,0]

In [133]:
# 문장 확인
tokenizer.decode(input_ids)

'<s>I like this movie<mask> Cast<mask><mask><mask><mask></s>'

In [134]:
# 위 과정 20회 반복
for _ in range(20):
    i = random.randint(1, SEQ_LEN - 2)
    input_ids[i] = tokenizer.mask_token_id

    result = model(tf.convert_to_tensor([input_ids]))

    logits = result[0]
    logits = logits[:, i, :]

    token_id = tf.random.categorical(logits, num_samples=1)
    input_ids[i] = token_id.numpy()[0,0]
    print(tokenizer.decode(input_ids))

<s>I like this movie<mask> Use<mask><mask><mask><mask></s>
<s>I like this movie<mask> Use<mask><mask><mask><mask></s>
<s>I like this movie<mask> Use Them<mask><mask><mask></s>
<s>I like silly movie<mask> Use Them<mask><mask><mask></s>


<s>Looks like silly movie<mask> Use Them<mask><mask><mask></s>
<s>Looks like silly movie<mask> Use Them<mask> �<mask></s>
<s>Looks like Matrix movie<mask> Use Them<mask> �<mask></s>
<s>Looks like Matrix movie<mask> Use TM<mask> �<mask></s>
<s>Looks like Matrix movie<mask> Use Jackson<mask> �<mask></s>
<s>Looks like Matrix movie actors Use Jackson<mask> �<mask></s>
<s>Looks like Matrix movie actors Use Jackson<mask> �<mask></s>
<s>Looks like Matrix movie actors Use Jackson Axe �<mask></s>
<s>Looks like Matrix movie actors Use Jackson Wiki �<mask></s>
<s>Looks like Matrix Singer actors Use Jackson Wiki �<mask></s>
<s>Looks like Matrix Singer actors Use Jackson Wiki �<mask></s>
<s>Looks like Matrix Singer actors Use Jackson Wiki 😉</s>
<s>Looks like Matrix Singer actors Use Matrix Wiki 😉</s>
<s>Looks like Matrix Singer actors Use Matrix Wiki 😉</s>
<s>Looks like Matrix robot actors Use Matrix Wiki 😉</s>
<s>Looks like Matrix robot actors Use Matrix Wiki 😉</s>


In [135]:
# top-k를 적용하여 20회 반복
input_ids = tokenizer.encode('I like this movie <mask><mask><mask><mask><mask><mask>')
for _ in range(20):
    i = random.randint(1, SEQ_LEN - 2)
    input_ids[i] = tokenizer.mask_token_id

    result = model(tf.convert_to_tensor([input_ids]))

    logits = result[0]
    logits = logits[:, i, :]
    logits = tf_top_k_top_p_filtering(logits, top_k=50) # top-k

    token_id = tf.random.categorical(logits, num_samples=1)
    input_ids[i] = token_id.numpy()[0,0]
    print(tokenizer.decode(input_ids))

<s>I like this movie<mask><mask><mask><mask><mask>?!</s>
<s>I like this pumpkin<mask><mask><mask><mask><mask>?!</s>
<s>I like this pumpkin<mask><mask><mask><mask><mask>....</s>
<s>I like this pumpkin<mask><mask> pumpkin<mask><mask>....</s>
<s>I like coconut pumpkin<mask><mask> pumpkin<mask><mask>....</s>
<s>I like coconut pumpkin<mask> fried pumpkin<mask><mask>....</s>
<s>I like coconut pumpkin<mask> fried pumpkin<mask> soup....</s>
<s>I like coconut pumpkin<mask> fried pumpkin<mask> stew....</s>
<s>I love coconut pumpkin<mask> fried pumpkin<mask> stew....</s>
<s>I enjoy coconut pumpkin<mask> fried pumpkin<mask> stew....</s>
<s>I enjoy coconut pumpkin spice fried pumpkin<mask> stew....</s>
<s>Also enjoy coconut pumpkin spice fried pumpkin<mask> stew....</s>
<s>Also enjoy coconut curry spice fried pumpkin<mask> stew....</s>
<s>Also enjoy coconut curry spice plus pumpkin<mask> stew....</s>
<s>Also enjoy coconut curry soup plus pumpkin<mask> stew....</s>
<s>Also enjoy coconut curry soup p

In [136]:
# top-p를 적용하여 20회 반복
input_ids = tokenizer.encode('I like this movie <mask><mask><mask><mask><mask><mask>')
for _ in range(20):
    i = random.randint(1, SEQ_LEN - 2)
    input_ids[i] = tokenizer.mask_token_id

    result = model(tf.convert_to_tensor([input_ids]))

    logits = result[0]
    logits = logits[:, i, :]
    logits = tf_top_k_top_p_filtering(logits, top_p=0.9) # top-p

    token_id = tf.random.categorical(logits, num_samples=1)
    input_ids[i] = token_id.numpy()[0,0]
    print(tokenizer.decode(input_ids))

<s>I like this movie<mask><mask><mask><mask>VIEW<mask></s>
<s>I like this �<mask><mask><mask><mask>VIEW<mask></s>
<s>I like this BL<mask><mask><mask><mask>VIEW<mask></s>
<s>I like this BL<mask><mask><mask><mask> OUT<mask></s>
<s>I like this BL<mask><mask> FIN<mask> OUT<mask></s>
<s>I like this BLOWN<mask> FIN<mask> OUT<mask></s>
<s>I like this BLOWN<mask> SEE<mask> OUT<mask></s>
<s>I like this GROWN<mask> SEE<mask> OUT<mask></s>
<s>I like this GROWN<mask> SEE<mask> OUTAGE</s>
<s>I like this GROWN<mask> SEE<mask> IMAGE</s>
<s>I like this GROWN<mask> SEE<mask> IMAGE</s>
<s>I like this GRANT<mask> SEE<mask> IMAGE</s>
<s>I like MORE GRANT<mask> SEE<mask> IMAGE</s>
<s>I like MORE GRANT<mask> SEE THE IMAGE</s>
<s>I like MORE GRANT INFORMATION SEE THE IMAGE</s>
<s>I like MORE GRANT INFORMATION SEE THIS IMAGE</s>
<s>I like MORE GRANT TO SEE THIS IMAGE</s>
<s>I like MORE GRANT NEWS SEE THIS IMAGE</s>
<s>I like MORE GRANT HELP SEE THIS IMAGE</s>
<s>I like MORE GRANT NEWS SEE THIS IMAGE</s>
