In [7]:
import re
import os

def clean_text(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        book_text = file.read()
    
    cleaned_text = re.sub(r'\n+', ' ', book_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    filename_only = os.path.basename(filepath)
    cleaned_filename = "cleaned_" + filename_only

    print(cleaned_filename, len(cleaned_text), "characters")

    folder = os.path.dirname(filepath)
    cleaned_path = os.path.join(folder, cleaned_filename)

    with open(cleaned_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)

folder_path = "/Users/hyun/dev_ws/harry_potter/data/"

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    clean_text(file_path)

cleaned_07 Harry Potter and the Deathly Hallows.txt 1133063 characters
cleaned_05 Harry Potter and the Order of the Phoenix.txt 1489734 characters
cleaned_02 Harry Potter and the Chamber of Secrets.txt 488771 characters
cleaned_06 Harry Potter and the Half-Blood Prince.txt 982041 characters
cleaned_03 Harry Potter and the Prisoner of Azkaban.txt 621137 characters
cleaned_01 Harry Potter and the Sorcerers Stone.txt 436000 characters
cleaned_04 Harry Potter and the Goblet of Fire.txt 1093670 characters


In [17]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

text = "Winter is coming!"

tokens = tokenizer.encode(text)

print("글자수:", len(text), "토큰수", len(tokens))
print(tokens)
print(tokenizer.decode(tokens))
for t in tokens:
    print(f"{t}\t -> {tokenizer.decode([t])}")

글자수: 17 토큰수 4
[35376, 318, 2406, 0]
Winter is coming!
35376	 -> Winter
318	 ->  is
2406	 ->  coming
0	 -> !


In [16]:
from transformers import AutoTokenizer # pip install transformers

tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct")  # KoGPT2 사용
# tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")  # KoGPT2 사용

print("Vocab size :", len(tokenizer))

text = "이웃에 방해가 되지 않는 선에서"

tokens = tokenizer.encode(text)

print(len(text), len(tokens))
print(tokens)
print(tokenizer.decode(tokens))

Vocab size : 102400
17 10
[22247, 2373, 14436, 905, 970, 698, 1145, 657, 1302, 41728]
이웃에 방해가 되지 않는 선에서


In [18]:
for char in text:
    token_ids = tokenizer.encode(char)
    decoded = tokenizer.decode(token_ids)
    print(f"{char} -> {token_ids} -> {decoded}")

W -> [54] -> W
i -> [72] -> i
n -> [77] -> n
t -> [83] -> t
e -> [68] -> e
r -> [81] -> r
  -> [220] ->  
i -> [72] -> i
s -> [82] -> s
  -> [220] ->  
c -> [66] -> c
o -> [78] -> o
m -> [76] -> m
i -> [72] -> i
n -> [77] -> n
g -> [70] -> g
! -> [0] -> !


In [28]:
import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, txt, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # token_ids = tokenizer.encode("<|endoftext|>" + txt, allowed_special={"<|endoftext|>"})
        token_ids = tokenizer.encode(txt)

        print("# of tokens in txt:", len(token_ids))

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

# with open("cleaned_한글문서.txt", 'r', encoding='utf-8-sig') as file: # 선택: -sig를 붙여서 BOM 제거
with open("../data/cleaned_02 Harry Potter and the Chamber of Secrets.txt", 'r', encoding='utf-8-sig') as file: # 선택: -sig를 붙여서 BOM 제거
    txt = file.read()

dataset = MyDataset(txt, max_length = 32, stride = 4)

train_loader = DataLoader(dataset, batch_size=128, shuffle=True, drop_last=True)

# of tokens in txt: 130520


In [29]:
dataiter = iter(train_loader)

x, y = next(dataiter)

print(tokenizer.decode(x[0].tolist()))
print(tokenizer.decode(y[0].tolist()))

�s time for you to step aside. This is an Order of Suspension — you’ll find all twelve signatures on it. I’m
s time for you to step aside. This is an Order of Suspension — you’ll find all twelve signatures on it. I’m afraid


In [30]:
VOCAB_SIZE = tokenizer.n_vocab # 50257 Tiktoken
#VOCAB_SIZE = len(tokenizer) # AutoTokenizer
CONTEXT_LENGTH = 128  # Shortened context length (orig: 1024)
EMB_DIM = 768  # Embedding dimension
NUM_HEADS = 12  # Number of attention heads
NUM_LAYERS = 12  # Number of layers
DROP_RATE = 0.1  # Dropout rate
QKV_BIAS = False  # Query-key-value bias

In [31]:
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        
        assert d_out % NUM_HEADS == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.head_dim = d_out // NUM_HEADS

        self.W_query = nn.Linear(d_in, d_out, bias=QKV_BIAS)
        self.W_key = nn.Linear(d_in, d_out, bias=QKV_BIAS)
        self.W_value = nn.Linear(d_in, d_out, bias=QKV_BIAS)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(DROP_RATE)
        self.register_buffer('mask', torch.triu(torch.ones(CONTEXT_LENGTH, CONTEXT_LENGTH), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        keys = keys.view(b, num_tokens, NUM_HEADS, self.head_dim)
        values = values.view(b, num_tokens, NUM_HEADS, self.head_dim)
        queries = queries.view(b, num_tokens, NUM_HEADS, self.head_dim)

        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        attn_scores = queries @ keys.transpose(2, 3)

        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1, 2)

        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)

        return context_vec

class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))

class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(EMB_DIM, 4 * EMB_DIM),
            GELU(),
            nn.Linear(4 * EMB_DIM, EMB_DIM),
        )

    def forward(self, x):
        return self.layers(x)

class TransformerBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=EMB_DIM,
            d_out=EMB_DIM)
    
        self.ff = FeedForward()
        self.norm1 = LayerNorm(EMB_DIM)
        self.norm2 = LayerNorm(EMB_DIM)
        self.drop_shortcut = nn.Dropout(DROP_RATE)

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x


class GPTModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.tok_emb = nn.Embedding(VOCAB_SIZE, EMB_DIM)
        self.pos_emb = nn.Embedding(CONTEXT_LENGTH, EMB_DIM)
        self.drop_emb = nn.Dropout(DROP_RATE)

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock() for _ in range(NUM_LAYERS)])

        self.final_norm = LayerNorm(EMB_DIM)
        self.out_head = nn.Linear(EMB_DIM, VOCAB_SIZE, bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [32]:
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print("Using device:", device)

torch.manual_seed(123)
model = GPTModel()
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

Using device: mps


In [34]:
tokens_seen, global_step = 0, -1

losses = []

for epoch in range(40):
    model.train()  # Set model to training mode
    
    epoch_loss = 0
    for input_batch, target_batch in train_loader:
        optimizer.zero_grad() # Reset loss gradients from previous batch iteration
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)

        logits = model(input_batch)
        loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
        epoch_loss += loss.item()
        loss.backward() # Calculate loss gradients
        optimizer.step() # Update model weights using loss gradients
        tokens_seen += input_batch.numel()
        global_step += 1

        if global_step % 1000 == 0:
            print(f"Tokens seen: {tokens_seen}")
        # Optional evaluation step

    avg_loss = epoch_loss / len(train_loader)
    losses.append(avg_loss)
    print(f"Epoch: {epoch + 1}, Loss: {avg_loss}")
    torch.save(model.state_dict(), "model_" + str(epoch + 1).zfill(3) + ".pth")

Tokens seen: 4096
Epoch: 1, Loss: 0.3644899288265724
Epoch: 2, Loss: 0.29445035603102737
Epoch: 3, Loss: 0.26637565192040497
Tokens seen: 4100096
Epoch: 4, Loss: 0.2508515864024012
Epoch: 5, Loss: 0.24162740882222108
Epoch: 6, Loss: 0.23564565621727096
Epoch: 7, Loss: 0.23038860240320522
Tokens seen: 8196096
Epoch: 8, Loss: 0.2255825633843114
Epoch: 9, Loss: 0.22118714557388636
Epoch: 10, Loss: 0.2186544587997001
Epoch: 11, Loss: 0.2148707326002947
Tokens seen: 12292096
Epoch: 12, Loss: 0.21408080432828017
Epoch: 13, Loss: 0.21007435616311126
Epoch: 14, Loss: 0.2069803169510496
Epoch: 15, Loss: 0.20501594984625268
Tokens seen: 16388096
Epoch: 16, Loss: 0.2030314862376123
Epoch: 17, Loss: 0.2020940687947386
Epoch: 18, Loss: 0.2005700552322733
Epoch: 19, Loss: 0.19778033668600667
Tokens seen: 20484096
Epoch: 20, Loss: 0.1960999135543981
Epoch: 21, Loss: 0.19552064475816067
Epoch: 22, Loss: 0.1935697317592741
Epoch: 23, Loss: 0.1923243437461027
Tokens seen: 24580096
Epoch: 24, Loss: 0.191

In [35]:
# 파일로 저장했던 네트워크의 가중치들 읽어들이기
model.load_state_dict(torch.load("model_014.pth", map_location=device, weights_only=True))
model.eval() # dropout을 사용하지 않음

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(128, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [36]:
idx = tokenizer.encode("Dobby is") # 토큰 id의 list
idx = torch.tensor(idx).unsqueeze(0).to(device)

with torch.no_grad():
    logits = model(idx)

logits = logits[:, -1, :]

# 가장 확률이 높은 단어 10개 출력
top_logits, top_indices = torch.topk(logits, 10) 
for p, i in zip(top_logits.squeeze(0).tolist(), top_indices.squeeze(0).tolist()):
    print(f"{p:.2f}\t {i}\t {tokenizer.decode([i])}")

# 가장 확률이 높은 단어 출력
idx_next = torch.argmax(logits, dim=-1, keepdim=True)
flat = idx_next.squeeze(0) # 배치 차원 제거 torch.Size([1])
out = tokenizer.decode(flat.tolist()) # 텐서를 리스트로 바꿔서 디코드

13.04	 257	  a
12.34	 973	  used
11.30	 991	  still
10.26	 4084	  clearly
9.36	 1479	  free
9.21	 1464	  always
8.99	 4762	  believed
8.94	 4978	  caught
8.57	 284	  to
8.53	 1908	  sent


In [37]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:
            break

        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

In [44]:
start_context = input("Start context: ")

# idx = tokenizer.encode(start_context, allowed_special={'<|endoftext|>'})
idx = tokenizer.encode(start_context)
idx = torch.tensor(idx).unsqueeze(0)

context_size = model.pos_emb.weight.shape[0] 

for i in range(10):

    token_ids = generate(
        model=model,
        idx=idx.to(device),
        max_new_tokens=50,
        context_size= context_size,
        top_k=50,
        temperature=0.5
    )

    flat = token_ids.squeeze(0) # remove batch dimension
    out = tokenizer.decode(flat.tolist()).replace("\n", " ")

    print(i, ":", out)

0 : Harry fights and sealed before Harry could stop him, screamed, “ATTACK! ATTACK! ANOTHER ATTACK! NO MORTAL OR GHOST IS SAFE!” The ghoul in the note telling them, which was creeping along the
1 : Harry fights and chasing the gateway and Dobby had darted to the bedroom door, pulled it open, and sprinted down the stairs. Mouth dry, stomach lurching, stomach lurching, stomach lurchrolling his stomach lurching, stomach
2 : Harry fights. He was difficult: She was a lot bigger than he was. “Dear, dear,” said Lockhart, skittering through the crowd, looking at the crowd, “but she merely came to the crowd backed away,
3 : Harry fights and sealed before Harry could stop him, screamed, “ATTACK! ATTACK! ANOTHER ATTACK! NO MORTAL OR GHOST IS SAFE!” There was creeping to the snake skin and splashed onto the note telling
4 : Harry fights and modest,” said Mr. Weasley, nodding coldly. “Busy time at the Ministry, I hear,” said Mr. “ Mr. “All right words. “All right outside your dad”
5 : Harry fights 