# 1.0 Env

In [None]:
import os
import re
import json
import numpy as np

import torch
import torch.nn.functional as F

In [None]:
# Disable autograd globally: no cell visible in this notebook calls backward(),
# so tensors do not need to record gradient history.
torch.set_grad_enabled(False)
# Base directory that the later %cd cells are relative to.
work_dir = '/home/ubuntu/nlp-practice'

In [None]:
%cd {work_dir}
!pwd

# 1.2 DL Quick Review

## What is Tensor?

In [None]:
# vector
torch.rand(3)

In [None]:
# matrix
torch.rand(2, 3)

In [None]:
# 3D tensor
torch.rand(3, 2, 3)

In [None]:
# 4D tensor
torch.rand(3, 3, 2, 3)

In [None]:
# 5D tensor
torch.rand(3, 3, 3, 2, 3)

In [None]:
# 6D tensor
torch.rand(3, 3, 3, 3, 2, 3)

In [None]:
# 3D tensor
torch.rand(3, 2, 3).shape

In [None]:
# 2D matrix
torch.rand(2, 3).shape

In [None]:
# Tabular Dataset
torch.rand(2, 3)[0].shape

In [None]:
# Mini-batch
torch.rand(10, 3)[:2].shape

## 손실 함수와 신경망 학습

In [None]:
y_true = torch.rand(10)
y_true

In [None]:
y_pred = torch.rand(10)
y_pred

In [None]:
# L2 Distance
(y_true - y_pred).pow(2).sum().sqrt()

In [None]:
# L2 norm
(y_true - y_pred).norm()

In [None]:
# RMSE
F.mse_loss(y_true, y_pred).sqrt()

In [None]:
# MSE
F.mse_loss(y_true, y_pred)

In [None]:
# MSE
(y_true - y_pred).pow(2).mean()

## Softmax & Cross Entropy

In [None]:
# label
y_label = torch.tensor([2, 0, 1, 0])

In [None]:
# logits
logits = torch.randn(4, 3)
logits

In [None]:
# Softmax computed by hand: exponentiate the logits once, then normalize
# each row so that it sums to 1 along the last dimension.
exp_logits = logits.exp()
y_pred = exp_logits / exp_logits.sum(dim=-1, keepdim=True)
y_pred

In [None]:
# softmax 함수호출
y_pred = F.softmax(logits, dim=-1)
y_pred

In [None]:
# y_true (one-hot)
y_true = torch.eye(3)[y_label]
y_true

In [None]:
# Cross-entropy loss computed by hand: for each sample sum y_true * log(y_pred)
# over the classes, average over the batch, and negate.
-(y_true * y_pred.log()).sum(dim=-1).mean()

In [None]:
# Cross-entropy loss via the library function; note that it takes the raw logits,
# not the softmax output.
# NOTE(review): y_true is built with torch.eye(3)[y_label], i.e. a float one-hot
# (class-probability) target — presumably this needs a PyTorch release that
# accepts probability targets; confirm the minimum supported version.
F.cross_entropy(logits, y_true)

## Maximum Likelihood Estimation: Equations

In [None]:
y_pred

In [None]:
# y_i: the predicted probability of the correct class for each sample
# (multiplying by the one-hot y_true zeroes out every other class before the sum).
y_prob = (y_true * y_pred).sum(dim=-1)
y_prob

In [None]:
# negative log likelihood
-y_prob.log().sum()

In [None]:
# negative log likelihood (mean)
-y_prob.log().mean()

In [None]:
# Cross Entropy Loss
F.cross_entropy(logits, y_true)

# 1.3 Preprocessing

In [None]:
%cd {work_dir}/src/kowiki
!pwd

## KoWiki Crawling

In [None]:
# Create the shared data directory that the download and read cells use
# ("../../data" relative to src/kowiki). wget -O does not create missing
# parent directories, so the directory has to exist before the download.
os.makedirs("../../data", exist_ok=True)

In [None]:
# wiki download
!wget https://dumps.wikimedia.org/kowiki/latest/kowiki-latest-pages-meta-current.xml.bz2 \
   -O ../../data/kowiki-latest-pages-meta-current.xml.bz2

In [None]:
# run src/kowiki/extract_kowiki.sh
# !sh ./extract_kowiki.sh

In [None]:
# Show the text of the first record of the extracted dump (one JSON object per line).
# The encoding is passed explicitly because the platform default is not guaranteed
# to be UTF-8 (assumes the extractor wrote UTF-8 — TODO confirm).
with open('../../data/kowiki/AA/wiki_00', encoding='utf-8') as f:
    first_record = f.readline()
# An empty file now fails here with a JSON decode error instead of leaving
# `wiki` undefined (or stale from an earlier run) for the print below.
wiki = json.loads(first_record)['text']
print(wiki)

## Regular Expression

### Phone Number

In [None]:
texts = [
    "010-1234-5678",
    "01012345678",
    "010 1234 5678",
    "010)1234-5678",
    "010.1234.5678",
    "011-1234-5678",
    "01112345678",
    "011-123-4567",
    "042-123-4567",
    "02-1234-5678",
    "821012345678",
    "+82-10-1234-5678",
]

In [None]:
# Phone number pattern: optional "+", optional country code 82, optional "-",
# optional leading 0, a 1-2 digit prefix, then 3-4 digits and a final 4 digits.
# The separator between digit groups is limited to "-", ".", " " or ")".
# An unescaped "." in that position would match ANY character, so strings
# such as "010a1234b5678" would be accepted as phone numbers.
# The separators stay in capture groups so group numbering is unchanged.
pattern = r"\+?(82)?-?0?[0-9]{1,2}([-. )])?[0-9]{3,4}([-. )])?[0-9]{4}"

In [None]:
# Print every sample the pattern fails to match (in list order),
# then report how many samples matched out of the total.
unmatched = [sample for sample in texts if re.match(pattern, sample) is None]
for sample in unmatched:
    print(sample)
cnt = len(texts) - len(unmatched)

print(f"{cnt}/{len(texts)}")

### Replacing

In [None]:
text = "My phone number is 010-1234-5678"

In [None]:
re.sub(pattern, "PHONE_NUMBER", text)

### Convert File

In [None]:
# Take the text of the first record of the extracted dump ...
# (explicit encoding: the platform default is not guaranteed to be UTF-8;
# assumes the extractor wrote UTF-8 — TODO confirm)
with open('../../data/kowiki/AA/wiki_00', encoding='utf-8') as f:
    wiki = json.loads(f.readline())['text']
# ... and save it as a small sample file, written with the same explicit
# encoding so the output does not depend on the platform default.
with open('../../data/kowiki/wiki_dump.txt', 'w', encoding='utf-8') as f:
    f.write(wiki)

In [None]:
def convert_file(input_fn, pattern, text, encoding='utf-8'):
    """Print every line of a text file after a regex substitution.

    Args:
        input_fn: path of the text file to read.
        pattern: regular expression (string or compiled pattern) to search for.
        text: replacement passed to ``re.sub`` for every match.
        encoding: encoding used to read the file. It defaults to UTF-8 so the
            result does not depend on the platform's default encoding.

    Each substituted line is printed with leading and trailing whitespace
    removed. Nothing is written back to the file and nothing is returned.
    """
    # Compile once instead of looking the pattern up again for every line.
    regex = re.compile(pattern)
    with open(input_fn, encoding=encoding) as f:
        for line in f:
            print(regex.sub(text, line).strip())

In [None]:
# number to *
convert_file('../../data/kowiki/wiki_dump.txt', r'[0-9]', '*')

In [None]:
# 한글이 아닌 글자 *
convert_file('../../data/kowiki/wiki_dump.txt', r'[^가-힣 ]', '*')

In [None]:
# 한글 *
convert_file('../../data/kowiki/wiki_dump.txt', r'[가-힣]', '*')

## Make Wiki Dump

In [None]:
# run src/kowiki/make_dump.sh
# !sh ./make_dump.sh

In [None]:
# Preview the beginning of the dump. The encoding is passed explicitly so the
# Korean text is decoded the same way whatever the platform default is
# (assumes the dump is UTF-8 — TODO confirm against make_dump.sh).
with open('../../data/kowiki/wiki_dump.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        print(line.strip())
        # The check runs after printing, so lines with index 0..101 are shown.
        if i > 100:
            break

## Mecab

In [None]:
from mecab import MeCab

In [None]:
mecab = MeCab()

In [None]:
mecab.pos('아버지가 방에 들어가신다.')

In [None]:
mecab.pos('아버지 가방에 들어가신다.')

In [None]:
mecab.morphs('아버지가방에들어가신다.')

In [None]:
# Tokenize the beginning of the dump into morphemes, one list per line.
# The encoding is passed explicitly so the Korean text is decoded the same way
# whatever the platform default is (assumes the dump is UTF-8 — TODO confirm).
with open('../../data/kowiki/wiki_dump.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        print(mecab.morphs(line.strip()))
        # The check runs after printing, so lines with index 0..101 are shown.
        if i > 100:
            break

## BBPE

In [None]:
# run src/kowiki/tokenizer_train.sh
# !sh ./tokenizer_train.sh

In [None]:
from transformers import T5TokenizerFast, AutoTokenizer

In [None]:
# Tokenizer load
tokenizer = T5TokenizerFast.from_pretrained("../../data/kowiki_32k")

In [None]:
ko_sentence = "<s>이것은 테스트 문장입니다. <unused_0>어떻게 보이나요?<unused_1> 고유명사 \"파이썬 파이토치 허깅페이스\"는 어떻게 되나요?</s>"
en_sentence = "<s>This is a test sentence. <unused_0>How does it look?<unused_1> Proper nouns \"Python PyTorch HuggingFace\" how does it go?</s>"

In [None]:
_ids = tokenizer.encode(ko_sentence)
print(_ids)
tokens = tokenizer.tokenize(ko_sentence)
print(tokens)

In [None]:
# skip_special_tokens = True
tokenizer.decode(_ids, skip_special_tokens=True)

In [None]:
# skip_special_tokens = False
tokenizer.decode(_ids, skip_special_tokens=False)

In [None]:
_ids = tokenizer.encode(ko_sentence)
print(_ids)
tokens = tokenizer.tokenize(ko_sentence)
print(tokens)

In [None]:
tokenizer.decode(_ids)

In [None]:
# load pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

In [None]:
_ids = tokenizer.encode(ko_sentence)
print(_ids)
tokens = tokenizer.tokenize(ko_sentence)
print(tokens)

In [None]:
tokenizer.decode(_ids)

## One-hot 인코딩

In [None]:
words = [
    "가위", "공책", "교과서", "노트", "딱풀",
    "볼펜", "색연필", "샤프", "싸인펜", "연필",
    "자", "지우개", "책상", "칼", "필기장",
    "필통"
]

In [None]:
# Map every word to its position in `words`; the position serves as the word id.
word2id = dict(zip(words, range(len(words))))
word2id

In [None]:
ids = [word2id[word] for word in ['공책', '노트', '지우개']]
ids

In [None]:
# One-hot encode the looked-up ids: one row per id, one column per vocabulary word.
# one_hot needs an int64 index tensor. Building it with torch.tensor(..., dtype=torch.long)
# guarantees that, while np.array() on Python ints can produce int32 on platforms
# whose default NumPy integer is 32-bit.
torch.nn.functional.one_hot(
    torch.tensor(ids, dtype=torch.long),
    num_classes=len(word2id)
)

In [None]:
# Tokenizer load
tokenizer = T5TokenizerFast.from_pretrained("../../data/kowiki_32k")

In [None]:
ids = tokenizer.encode('공책 노트 지우개')
ids

In [None]:
tokenizer.decode(ids)

In [None]:
# Shape of the one-hot encoding of the token ids: (number of ids, tokenizer size).
# one_hot needs an int64 index tensor. Building it with torch.tensor(..., dtype=torch.long)
# guarantees that, while np.array() on Python ints can produce int32 on platforms
# whose default NumPy integer is 32-bit.
torch.nn.functional.one_hot(
    torch.tensor(ids, dtype=torch.long),
    num_classes=len(tokenizer)
).shape

In [None]:
# tokenizer 비교
# https://huggingface.co/openai-community/gpt2/tree/main
# https://huggingface.co/google/gemma-2b/tree/main

## 미니배치 만들기

In [None]:
# Collect the first 10 non-empty lines of the dump as sample sentences.
sentences = []
# Explicit encoding so the Korean text is decoded the same way whatever the
# platform default is (assumes the dump is UTF-8 — TODO confirm).
with open('../../data/kowiki/wiki_dump.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            sentences.append(line)
        # Stop reading as soon as enough sentences have been gathered.
        if len(sentences) >= 10:
            break
sentences

In [None]:
# Encode every sentence into token ids and wrap each id list as a 1-D tensor.
tensors = [
    torch.from_numpy(np.array(tokenizer.encode(sentence)))
    for sentence in sentences
]
tensors

In [None]:
# Pad the variable-length id tensors to a common length and stack them into one
# batch tensor. batch_first=True puts the batch dimension first, and the padded
# positions are filled with the tokenizer's pad token id.
mini_batch = torch.nn.utils.rnn.pad_sequence(
    tensors,
    batch_first=True,
    padding_value=tokenizer.pad_token_id
)
print(mini_batch.shape)
mini_batch

# 1.4 Word Embedding

In [None]:
%cd {work_dir}/src/kowiki
!pwd

## Word2Vec (gensim)

In [None]:
import gensim
import gensim.downloader as api

In [None]:
# 이미 학습된 model download
# wv = api.load('word2vec-google-news-300') # 1.6G
wv = api.load('glove-wiki-gigaword-100') # 128M

In [None]:
# Print the vocabulary size followed by the first 20 entries with their index.
print(f"len: {len(wv.index_to_key)}")
for rank, token in zip(range(20), wv.index_to_key):
    print(f"{rank:2d}: {token}")

In [None]:
wv.most_similar('obama')

In [None]:
wv.most_similar('banana')

In [None]:
wv.most_similar('apple')

In [None]:
# p1 - n1 + p2
def analogy(p1, n1, p2):
    """Run the word-analogy query "p1 - n1 + p2" against the loaded vectors.

    ``p1`` and ``p2`` are passed as positive terms and ``n1`` as the negative
    term to ``most_similar`` on the module-level ``wv`` object; whatever that
    call returns is returned unchanged.
    """
    return wv.most_similar(negative=[n1], positive=[p2, p1])

In [None]:
# king - man + woman
analogy('king', 'man', 'woman')

In [None]:
# japanese - japan + australia
analogy('japanese', 'japan', 'australia')

In [None]:
# beer - australia + france
analogy('beer', 'australia', 'france')

In [None]:
# clinton - reagan + obama
analogy('clinton', 'reagan', 'obama')

In [None]:
# tallest - tall + long
analogy('tallest', 'tall', 'long')

In [None]:
# fantastic - good + bad
analogy('fantastic', 'good', 'bad')

In [None]:
wv.doesnt_match("breakfast cereal dinner lunch".split())

## 임베딩 레이어

In [None]:
# Tokenizer load
tokenizer = T5TokenizerFast.from_pretrained("../../data/kowiki_32k")

In [None]:
# Embedding table with one 4-dimensional vector per tokenizer entry.
# padding_idx marks the pad token's row as padding (per the PyTorch docs that row
# is initialized to zeros and receives no gradient updates).
embedding = torch.nn.Embedding(
    len(tokenizer),
    4,
    padding_idx=tokenizer.pad_token_id)
embedding

In [None]:
embedding.weight

In [None]:
# Encode a sample sentence into token ids, then append one pad id
# (presumably so the padding row is visible in the embedding lookups that follow).
ids = tokenizer.encode('지미 카터는 조지아주 섬터 카운티 플레인스 마을에서 태어났다.')
ids.append(tokenizer.pad_token_id)
print(ids)

In [None]:
# One-hot encode the token ids: one row per id, one column per tokenizer entry.
# one_hot needs an int64 index tensor. Building it with torch.tensor(..., dtype=torch.long)
# guarantees that, while np.array() on Python ints can produce int32 on platforms
# whose default NumPy integer is 32-bit.
onehot = torch.nn.functional.one_hot(
    torch.tensor(ids, dtype=torch.long),
    num_classes=len(tokenizer)
)
# Cast to float so the matrix can be multiplied with the float embedding weights.
onehot = onehot.float()
onehot.shape

In [None]:
# Multiplying the one-hot rows by the embedding matrix selects the matching weight rows:
# (seq_len, vocab) x (vocab, 4) -> (seq_len, 4).
# The original note gives (15, 32100) x (32100, 4) = (15, 4) for this tokenizer and sentence.
torch.matmul(onehot, embedding.weight)

In [None]:
# Direct lookup by id, without building one-hot vectors: (seq_len,) -> (seq_len, 4).
# The original note gives (15,) -> (15, 4) for this sentence.
embedding(torch.from_numpy(np.array(ids)))