In [1]:
!pip install torch



In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import glob
import os
import string
import random
import time
import math

In [3]:
# 1.1. 기본 변수 설정
ALL_LETTERS = string.ascii_letters + " .,;'" # 모든 사용 가능 문자
N_LETTERS = len(ALL_LETTERS)

# 1.2. 데이터 경로 찾기 (data/names/*.txt)
data_path = './name_data/*.txt'

# 1.3. 데이터 로드 및 전처리
category_lines = {} # { 'Korean': ['Kim', 'Lee', ...], 'English': ['Smith', ...] }
all_categories = [] # ['Korean', 'English', ...]

In [4]:
def find_files(path):
  return glob.glob(path)

In [5]:
# 파일에서 이름 읽기
def read_lines(filename):
  try:
    with open(filename, encoding='utf-8') as f:
      lines = f.read().strip().split('\n')
    return lines
  except UnicodeDecodeError:
    print(f"Skipping {filename} due to encoding issue.")
    return []

In [6]:

# 데이터 로딩 실행
for filename in find_files(data_path):
  category = os.path.splitext(os.path.basename(filename))[0]
  all_categories.append(category)
  lines = read_lines(filename)
  category_lines[category] = lines

N_CATEGORIES = len(all_categories)
if N_CATEGORIES == 0:
  print("!!! 데이터 파일을 찾을 수 없습니다. 'data/names/' 경로를 확인하세요.")
  exit()

print(f"총 {N_CATEGORIES}개의 국적 로드 완료. {all_categories}")


총 18개의 국적 로드 완료. ['Czech', 'Italian', 'Vietnamese', 'English', 'Greek', 'Korean', 'Irish', 'Scottish', 'Arabic', 'Portuguese', 'Russian', 'Chinese', 'Polish', 'Japanese', 'Spanish', 'French', 'German', 'Dutch']


In [7]:
# --- 2. 헬퍼 함수 (이름 -> 텐서) ---

# 문자를 인덱스로 변환 (예: 'a' -> 0)
def letter_to_index(letter):
  return ALL_LETTERS.find(letter)

# 이름을 [sequence_length, 1, n_letters] 크기의
# "원-핫(One-Hot) 텐서"로 변환합니다.
def name_to_tensor(name):
  tensor = torch.zeros(len(name), 1, N_LETTERS)
  for li, letter in enumerate(name):
    index = letter_to_index(letter)
    if index != -1: # ALL_LETTERS에 있는 문자만 처리
      tensor[li][0][index] = 1
  return tensor

In [8]:

# 국적(카테고리)을 텐서(인덱스)로 변환
def category_to_tensor(category):
  li = all_categories.index(category)
  return torch.tensor([li], dtype=torch.long)

# 학습을 위한 랜덤 샘플 생성기
def random_training_example():
  category = random.choice(all_categories)
  name = random.choice(category_lines[category])
  category_tensor = category_to_tensor(category)
  name_tensor = name_to_tensor(name)
  return category, name, category_tensor, name_tensor

In [9]:

# --- 3. LSTM 모델 정의 ---

class LSTMClassifier(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(LSTMClassifier, self).__init__()
    self.hidden_size = hidden_size

    # input_size = N_LETTERS (원-핫 벡터 크기)
    # hidden_size = LSTM의 은닉 상태 크기
    self.lstm = nn.LSTM(input_size, hidden_size)

    # LSTM의 최종 은닉 상태를 받아 N_CATEGORIES (국적 수)로 출력
    self.hidden_to_output = nn.Linear(hidden_size, output_size)

    # NLLLoss를 사용하기 위해 LogSoftmax 추가
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, input_tensor, hidden, cell):
    # input_tensor: [seq_len, 1, n_letters]
    # hidden, cell: [1, 1, hidden_size]

    # LSTM은 (output, (next_hidden, next_cell))을 반환
    output, (hidden, cell) = self.lstm(input_tensor, (hidden, cell))

    # 우리는 시퀀스의 '마지막' 출력만 관심 있습니다.
    # output[-1]은 마지막 타임스텝의 LSTM 출력
    output = self.hidden_to_output(output[-1])
    output = self.softmax(output)
    return output, hidden, cell

  def init_hidden_cell(self):
    # LSTM은 RNN과 달리 은닉 상태(hidden)와 셀 상태(cell) 2개가 필요합니다.
    return (torch.zeros(1, 1, self.hidden_size),
            torch.zeros(1, 1, self.hidden_size))

In [10]:
# 모델 인스턴스 생성
N_HIDDEN = 128
model = LSTMClassifier(N_LETTERS, N_HIDDEN, N_CATEGORIES)


In [11]:

# --- 4. 모델 학습 ---

# 4.1. 학습 설정
learning_rate = 0.005
criterion = nn.NLLLoss() # Negative Log Likelihood Loss
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


In [12]:
# 4.2. 학습 함수
def train(category_tensor, name_tensor):
  # (중요) 학습 시작 전, 은닉 상태 초기화
  hidden, cell = model.init_hidden_cell()

  # (중요) 그래디언트 초기화
  optimizer.zero_grad()

  # LSTM은 시퀀스의 각 문자를 순회하지 않고,
  # [seq_len, 1, n_letters] 텐서를 한 번에 입력받을 수 있습니다.
  output, hidden, cell = model(name_tensor, hidden, cell)

  # 손실 계산
  loss = criterion(output, category_tensor)

  # 역전파
  loss.backward()

  # 가중치 업데이트
  optimizer.step()

  return output, loss.item()

In [16]:

# 4.3. 학습 루프 실행
N_ITERS = 200000 # 총 학습 횟수
PRINT_EVERY = 5000
PLOT_EVERY = 1000

current_loss = 0
all_losses = []

start = time.time()

print("\n--- 모델 학습 시작 ---")
for iter in range(1, N_ITERS + 1):
  category, name, category_tensor, name_tensor = random_training_example()
  output, loss = train(category_tensor, name_tensor)
  current_loss += loss

  # 로그 출력
  if iter % PRINT_EVERY == 0:
    guess_idx = torch.argmax(output)
    guess = all_categories[guess_idx.item()]
    correct = 'O' if guess == category else f'X ({category})'

    elapsed = time.time() - start
    print(f"{iter:>6} {iter / N_ITERS * 100:5.2f}% ({elapsed:.0f}s) Loss: {loss:.4f} | {name} / {guess} {correct}")

  # Plotting을 위한 손실 저장 (나중에 시각화 가능)
  if iter % PLOT_EVERY == 0:
    all_losses.append(current_loss / PLOT_EVERY)
    current_loss = 0

print("--- 모델 학습 완료 ---")



--- 모델 학습 시작 ---
  5000  2.50% (6s) Loss: 0.0900 | Kagawa / Japanese O
 10000  5.00% (12s) Loss: 1.7033 | Rahal / Irish X (Arabic)
 15000  7.50% (19s) Loss: 0.7277 | Ryzhey / Russian O
 20000 10.00% (25s) Loss: 0.7372 | Alves / Portuguese O
 25000 12.50% (31s) Loss: 2.1238 | Cao / Korean X (Vietnamese)
 30000 15.00% (37s) Loss: 0.8223 | Samaha / Arabic O
 35000 17.50% (42s) Loss: 0.7041 | Plourde / French O
 40000 20.00% (48s) Loss: 2.3522 | Goodchild / French X (English)
 45000 22.50% (54s) Loss: 0.1212 | Junusov / Russian O
 50000 25.00% (60s) Loss: 0.2508 | Thao / Vietnamese O
 55000 27.50% (66s) Loss: 3.2470 | Sze  / Korean X (Chinese)
 60000 30.00% (72s) Loss: 0.1732 | Shiganori / Japanese O
 65000 32.50% (78s) Loss: 0.1191 | Jackson / Scottish O
 70000 35.00% (83s) Loss: 0.4112 | Pakulski / Polish O
 75000 37.50% (89s) Loss: 0.0245 | O'Rourke / Irish O
 80000 40.00% (95s) Loss: 0.0049 | Aconi / Italian O
 85000 42.50% (101s) Loss: 0.6813 | Milne / Scottish O
 90000 45.00% (107s)

In [24]:

# --- 5. 모델 평가 (추론) ---

def evaluate(name_string, n_predictions=3):
  print(f"--- 평가: '{name_string}' ---")

  # 학습 때와 동일하게 텐서로 변환
  name_tensor = name_to_tensor(name_string)

  # (중요) 평가 시에는 그래디언트 계산 비활성화
  with torch.no_grad():
    hidden, cell = model.init_hidden_cell()
    output, hidden, cell = model(name_tensor, hidden, cell)

    # Top-K (상위 K개) 예측 결과 가져오기
    # output은 LogSoftmax 결과이므로, torch.exp로 확률로 변환
    probabilities = torch.exp(output)

    topv, topi = probabilities.topk(n_predictions, 1, True)

    for i in range(n_predictions):
      value = topv[0][i].item()
      category_index = topi[0][i].item()
      print(f"({i+1}) {all_categories[category_index]}: {value * 100:.2f}%")

In [18]:


# 테스트 실행
evaluate('Kim')
evaluate('Jackson')
evaluate('Satoshi')
evaluate('Schmidt')


--- 평가: 'Kim' ---
(1) Korean: 61.52%
(2) Vietnamese: 22.57%
(3) Chinese: 11.74%

--- 평가: 'Jackson' ---
(1) Scottish: 93.53%
(2) English: 5.57%
(3) Russian: 0.43%

--- 평가: 'Satoshi' ---
(1) Japanese: 90.49%
(2) Italian: 6.35%
(3) Arabic: 2.01%

--- 평가: 'Schmidt' ---
(1) German: 40.67%
(2) Czech: 27.60%
(3) Scottish: 14.75%


In [25]:
for i in range(10):
  category, name, category_tensor, name_tensor = random_training_example()
  print(f"\nLabel={category}")
  evaluate(name)



Label=Chinese
--- 평가: 'Hui' ---
(1) Chinese: 77.69%
(2) Vietnamese: 13.29%
(3) Korean: 8.06%

Label=French
--- 평가: 'Renaud' ---
(1) French: 69.04%
(2) Irish: 21.16%
(3) English: 5.60%

Label=Spanish
--- 평가: 'Gomez' ---
(1) Spanish: 77.29%
(2) Portuguese: 17.62%
(3) Polish: 2.22%

Label=German
--- 평가: 'Muhlfeld' ---
(1) English: 39.37%
(2) Scottish: 38.88%
(3) Irish: 11.93%

Label=Czech
--- 평가: 'Pech' ---
(1) Czech: 39.12%
(2) English: 20.93%
(3) German: 14.76%

Label=French
--- 평가: 'Séverin' ---
(1) French: 37.78%
(2) Czech: 24.81%
(3) Russian: 13.53%

Label=Vietnamese
--- 평가: 'Quach' ---
(1) Vietnamese: 99.60%
(2) Irish: 0.35%
(3) English: 0.02%

Label=Irish
--- 평가: 'Nuallan' ---
(1) Irish: 98.46%
(2) English: 0.88%
(3) Scottish: 0.55%

Label=Spanish
--- 평가: 'Azarola' ---
(1) Spanish: 93.35%
(2) Italian: 5.36%
(3) Polish: 0.69%

Label=Greek
--- 평가: 'Polymenakou' ---
(1) Greek: 97.84%
(2) Korean: 1.25%
(3) Russian: 0.65%
