# 기본환경 설정

In [None]:
from google.colab import userdata
import huggingface_hub

In [None]:
# Read the Hugging Face access token stored under the Colab secret name
# "HF_KEY" and use it to log this session in to the Hugging Face Hub.
HF_KEY = userdata.get("HF_KEY")
huggingface_hub.login(HF_KEY)

# 데이터 준비

In [None]:
import os
import sentencepiece as spm
from datasets import load_dataset

In [None]:
# Create the directory that the trained SentencePiece model files are written
# to; exist_ok=True keeps re-running this cell from raising.
os.makedirs("spm_models", exist_ok=True)

## 영어

In [None]:
# Load only the first 100 rows of the "ag_news" train split as the English sample.
dataset_eng = load_dataset("ag_news", split="train[:100]")

In [None]:
# Take the "text" column as the English corpus and preview its first two entries.
sentences_eng = dataset_eng["text"]
sentences_eng[:2]

In [None]:
# Write the English entries to corpus_eng.txt (UTF-8), each followed by a
# newline; this file is the training input for the English SentencePiece model.
with open("corpus_eng.txt", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in sentences_eng)

## 한국어

In [None]:
# Load only the first 100 rows of the train split of the
# "daekeun-ml/naver-news-summarization-ko" dataset as the Korean sample.
dataset_kor = load_dataset("daekeun-ml/naver-news-summarization-ko", split="train[:100]")

In [None]:
# Take the "document" column as the Korean corpus and preview its first two entries.
sentences_kor = dataset_kor["document"]
sentences_kor[:2]

In [None]:
# Write the Korean entries to corpus_kor.txt (UTF-8), each followed by a
# newline; this file is the training input for the Korean SentencePiece model.
with open("corpus_kor.txt", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in sentences_kor)

# SentencePiece 모델 학습

## 영어

In [None]:
# Train the English SentencePiece model on corpus_eng.txt with a 1000-entry
# vocabulary; outputs are written under the "spm_models/spm_eng" prefix.
spm.SentencePieceTrainer.train(
    input="corpus_eng.txt",
    model_prefix="spm_models/spm_eng",
    vocab_size=1000,
    model_type="bpe" # other model types such as unigram or char are also possible
)

## 한국어

In [None]:
# Train the Korean SentencePiece model on corpus_kor.txt with a 1000-entry
# vocabulary; outputs are written under the "spm_models/spm_kor" prefix.
spm.SentencePieceTrainer.train(
    input="corpus_kor.txt",
    model_prefix="spm_models/spm_kor",
    vocab_size=1000,
    model_type="bpe" # other model types such as unigram or char are also possible
)

# SentencePiece 모델 로드 & 토큰화

In [None]:
# Load the two trained models from the .model files under spm_models/.
tokenizer_eng = spm.SentencePieceProcessor(model_file="spm_models/spm_eng.model")
tokenizer_kor = spm.SentencePieceProcessor(model_file="spm_models/spm_kor.model")

In [None]:
# Example sentences (one English, one Korean) used in the cells below.
example_eng = "I love natural language processing."
example_kor = "저는 자연어 처리를 공부하고 있습니다."

In [None]:
# Tokenize each example with the tokenizer for its own language;
# out_type=str asks for the subword pieces as strings instead of integer ids.
tokens_eng = tokenizer_eng.encode(example_eng, out_type=str)
tokens_kor = tokenizer_kor.encode(example_kor, out_type=str)

In [None]:
# Print the English and Korean token lists on separate lines.
print(f"영어 토큰: {tokens_eng}\n한글 토큰: {tokens_kor}")

# 인덱스 변환 및 임베딩 : PyTorch 예시

In [None]:
import torch
from torch import nn

In [None]:
# Convert each example sentence to token ids. The id list is wrapped in an
# outer list so the resulting int64 tensor has a leading dimension of 1.
ids_eng = torch.tensor([tokenizer_eng.encode(example_eng)], dtype=torch.long)
ids_kor = torch.tensor([tokenizer_kor.encode(example_kor)], dtype=torch.long)

In [None]:
# Embedding setup: one lookup table per language, with as many rows as that
# tokenizer's vocabulary (get_piece_size()) and emb_dim columns.
emb_dim = 16
embedding_eng = nn.Embedding(tokenizer_eng.get_piece_size(), emb_dim)
embedding_kor = nn.Embedding(tokenizer_kor.get_piece_size(), emb_dim)

In [None]:
# Display both embedding layers.
embedding_eng, embedding_kor

In [None]:
# Create the embedding vectors by looking up each id tensor in the embedding
# table of its own language. ids_kor holds Korean-vocabulary indices, so it
# must go through embedding_kor; looking it up in embedding_eng would not
# raise here (both tables have the same size) but would return rows of the
# wrong table.
embeds_eng = embedding_eng(ids_eng)
embeds_kor = embedding_kor(ids_kor)

In [None]:
# Display the shapes of the English and Korean embedding outputs.
embeds_eng.shape, embeds_kor.shape

In [None]:
# Display the English token id tensor.
ids_eng

In [None]:
# Display the embedding vectors looked up for the English token ids.
embeds_eng

# 토큰화 방법 비교

In [None]:
# 1) Build a tiny four-sentence corpus and save it to corpus.txt (UTF-8).
#    Sentences are newline-separated, with no trailing newline.
corpus = "\n".join(
    [
        "I love natural language processing.",
        "I love machine learning.",
        "ChatGPT is amazing.",
        "Natural language models are powerful.",
    ]
)
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write(corpus)

In [None]:
# 2) Train one unigram and one bpe model on corpus.txt with identical settings
#    (vocab_size=40, character_coverage=1.0); each model is saved under
#    spm_models/spm_<model type>.
for mtype in ["unigram", "bpe"]:
    spm.SentencePieceTrainer.train(
        input="corpus.txt",
        model_prefix=f"spm_models/spm_{mtype}",
        vocab_size=40,
        model_type=mtype,
        character_coverage=1.0
    )

In [None]:
# 3) Load the trained unigram and bpe models.
sp_unigram = spm.SentencePieceProcessor(model_file="spm_models/spm_unigram.model")
sp_bpe     = spm.SentencePieceProcessor(model_file="spm_models/spm_bpe.model")

In [None]:
# 4) Tokenize the sample sentence with both models, requesting string pieces.
sample = "I love natural language processing."
tokens_unigram = sp_unigram.encode(sample, out_type=str)
tokens_bpe     = sp_bpe.encode(sample,     out_type=str)

In [None]:
# 5) char and word tokenization are done by hand rather than with a model:
#    - char: every character is its own token, with each space shown as "▁"
#    - word: split on single spaces
tokens_char = list(sample.replace(" ", "▁"))
tokens_word = sample.split(" ")

In [None]:
# 6) Print each tokenization followed by its token count. The labels are
#    padded to the same width so the output columns line up.
for label, toks in (
    ("unigram:", tokens_unigram),
    ("bpe    :", tokens_bpe),
    ("char   :", tokens_char),
    ("word   :", tokens_word),
):
    print(label, toks, "→", len(toks), "tokens")