In [237]:
from google.colab import drive

# Authenticate and create the PyDrive client.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [238]:
import torch.optim as optim
import numpy as np
import time
import torch.nn.functional as F

In [239]:
# 데이터 전처리
file_path = '/content/drive/MyDrive/LSTM DATA.txt'

with open(file_path, 'r', encoding='utf8') as file:
    lines = [line.strip() for line in file]

data = ' '.join(lines)
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“', '').replace('”', '')

data = data.lower()
words = list(set(data.split()))

In [240]:
words[:500]

['be--if',
 'half-reluctant',
 'blushed,',
 'ago.',
 'patronage,',
 'obsolete,',
 '_are_',
 'add.',
 'battled',
 'demands',
 'peep',
 '_knew_',
 '(trademark/copyright)',
 'accompanying',
 'softness',
 'motive.',
 'paining',
 'performed',
 'bingleys’',
 'success,--no',
 'grieved.',
 'display.',
 'adds,',
 'enabling',
 'displaying',
 'ay,',
 'afflicting',
 'excelled',
 'insensible',
 'for,',
 'support.',
 'unnecessary.',
 'lament',
 'inhuman',
 'less,',
 'denying',
 'edmund',
 'earthly',
 'animating',
 'hurrying',
 'thoughtless',
 'refused?',
 'otherwise,',
 'lane,',
 'resist',
 'plain,',
 'remark;',
 'fine,--with',
 'as,',
 'neighbourhood.',
 'improbable.',
 'declaration.',
 'vindication,',
 'ponies',
 'october_.',
 'dinner:',
 '270',
 'possession',
 'residence',
 'agreeable',
 'gloomy.',
 'ensued,',
 'hesitate;',
 'misleading',
 'man--he',
 '53',
 'replied',
 'are)',
 'dazzling',
 'jones',
 'mend',
 '18',
 'belonged.',
 'gallery.',
 '_mightily--might',
 'come.',
 'forster’s!',
 'utmost

In [241]:
# word mapping

word_to_index = {word: i for i, word in enumerate(words)}
index_to_word = {i: word for word, i in word_to_index.items()}

In [242]:
print(dict(list(word_to_index.items())[:50]))

{'be--if': 0, 'half-reluctant': 1, 'blushed,': 2, 'ago.': 3, 'patronage,': 4, 'obsolete,': 5, '_are_': 6, 'add.': 7, 'battled': 8, 'demands': 9, 'peep': 10, '_knew_': 11, '(trademark/copyright)': 12, 'accompanying': 13, 'softness': 14, 'motive.': 15, 'paining': 16, 'performed': 17, 'bingleys’': 18, 'success,--no': 19, 'grieved.': 20, 'display.': 21, 'adds,': 22, 'enabling': 23, 'displaying': 24, 'ay,': 25, 'afflicting': 26, 'excelled': 27, 'insensible': 28, 'for,': 29, 'support.': 30, 'unnecessary.': 31, 'lament': 32, 'inhuman': 33, 'less,': 34, 'denying': 35, 'edmund': 36, 'earthly': 37, 'animating': 38, 'hurrying': 39, 'thoughtless': 40, 'refused?': 41, 'otherwise,': 42, 'lane,': 43, 'resist': 44, 'plain,': 45, 'remark;': 46, 'fine,--with': 47, 'as,': 48, 'neighbourhood.': 49}


In [243]:
import torch

# Create input 시퀀스, 레이블

max_len = 10
X = []
y = []

for i in range(0,len(words) - max_len):
  X.append([word_to_index[word] for word in words[i:i+max_len]])
  y.append(word_to_index[words[i+max_len]])

X = torch.tensor(X, dtype=torch.long)
y = torch.tensor(y)

In [244]:
X[:1]

tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [245]:
X.shape

torch.Size([13137, 10])

In [246]:
y.shape

torch.Size([13137])

In [247]:
import torch.nn as nn


class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_size = hidden_size

        # weight & bias 초기화

        # ex) W_ii : 현재 input x_t와 곱해지는 weight, W_hi : 이전 hidden state h_(t-1)와 곱해지는 weight

        # input gate
        self.W_ii = nn.Parameter(torch.Tensor(embedding_dim, hidden_size))
        self.W_hi = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_ii = nn.Parameter(torch.Tensor(hidden_size))
        self.b_hi = nn.Parameter(torch.Tensor(hidden_size))

        # forget gate
        self.W_if = nn.Parameter(torch.Tensor(embedding_dim, hidden_size))
        self.W_hf = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_if = nn.Parameter(torch.Tensor(hidden_size))
        self.b_hf = nn.Parameter(torch.Tensor(hidden_size))

        # cell state update
        self.W_ig = nn.Parameter(torch.Tensor(embedding_dim, hidden_size))
        self.W_hg = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_ig = nn.Parameter(torch.Tensor(hidden_size))
        self.b_hg = nn.Parameter(torch.Tensor(hidden_size))

        # output gate
        self.W_io = nn.Parameter(torch.Tensor(embedding_dim, hidden_size))
        self.W_ho = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_io = nn.Parameter(torch.Tensor(hidden_size))
        self.b_ho = nn.Parameter(torch.Tensor(hidden_size))


        # initialize
        self.reset_parameters()



    def reset_parameters(self):
        # Xavier 초기화
        nn.init.xavier_uniform_(self.W_ii)
        nn.init.xavier_uniform_(self.W_hi)
        nn.init.constant_(self.b_ii, 0)
        nn.init.constant_(self.b_hi, 0)

        nn.init.xavier_uniform_(self.W_if)
        nn.init.xavier_uniform_(self.W_hf)
        nn.init.constant_(self.b_if, 0)
        nn.init.constant_(self.b_hf, 0)

        nn.init.xavier_uniform_(self.W_ig)
        nn.init.xavier_uniform_(self.W_hg)
        nn.init.constant_(self.b_ig, 0)
        nn.init.constant_(self.b_hg, 0)

        nn.init.xavier_uniform_(self.W_io)
        nn.init.xavier_uniform_(self.W_ho)
        nn.init.constant_(self.b_io, 0)
        nn.init.constant_(self.b_ho, 0)


    def forward(self, input, hx=None):
        embedded = self.embedding(input) # hx은 이전 time step에서의 hidden state
        if hx is None:
            hx = torch.zeros(embedded.size(0), self.hidden_size, device=embedded.device)

        outputs = []
        for i in range(embedded.size(1)):
            hx = self.cell(embedded[:, i], hx)
            outputs.append(hx)
        outputs = torch.stack(outputs, dim=1)
        return outputs


    def cell (self, input, hx):

      # input gate
      i_t = torch.sigmoid(torch.matmul(input, self.W_ii) + self.b_ii + torch.matmul(hx, self.W_hi) + self.b_hi)

      # forget gate
      f_t = torch.sigmoid(torch.matmul(input, self.W_if) + self.b_if + torch.matmul(hx, self.W_hf) + self.b_hf)

      # cell state update
      g_t = torch.tanh(torch.matmul(input, self.W_ig) + self.b_ig + torch.matmul(hx, self.W_hg) + self.b_hg)

      c_t = f_t * hx + i_t * g_t

      # output gate
      o_t = torch.sigmoid(torch.matmul(input, self.W_io) + self.b_io + torch.matmul(hx, self.W_ho) + self.b_ho)

      # hidden state update
      h_t = o_t * torch.tanh(c_t)

      return h_t

In [248]:
# Initialize LSTM model

vocab_size = len(word_to_index)  # 어휘 사전의 크기
embedding_dim = 256  # 임베딩 차원
hidden_size = 512
model = LSTM(vocab_size, embedding_dim, hidden_size)

In [249]:
# loss function and optimizer

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [250]:
X.shape

torch.Size([13137, 10])

In [251]:
y.shape

torch.Size([13137])

In [252]:
print(f"어휘 사전 크기: {len(word_to_index)}")
print(f"모델 임베딩 계층 크기: {model.embedding.embedding_dim}")

# 입력 데이터에서 최소 인덱스와 최대 인덱스 확인
min_index = min([min(seq) for seq in X])  # X는 입력 데이터
max_index = max([max(seq) for seq in X])

print(f"최소 인덱스: {min_index}, 최대 인덱스: {max_index}")

# word_to_index의 최대 인덱스 값을 확인
max_vocab_index = max(word_to_index.values())

if max_index > max_vocab_index:
    print("입력 데이터에 어휘 사전에 없는 인덱스가 포함되어 있습니다.")


어휘 사전 크기: 13147
모델 임베딩 계층 크기: 256
최소 인덱스: 0, 최대 인덱스: 13145


In [253]:
# 모델 학습
def train(model, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        start_time = time.time()
        total_loss = 0
        for i in range(0, X.size(0)):
            inputs = X[i].unsqueeze(0)  # 배치 차원 추가
            targets = y[i].unsqueeze(0)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs[:, -1, :], targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / X.size(0)
        elapsed_time = time.time() - start_time
        print(f'Epoch {epoch+1}/{epochs} Loss: {avg_loss:.4f} Time: {elapsed_time:.2f}s')

train(model, criterion, optimizer, epochs=10)

IndexError: Target 512 is out of bounds.

In [None]:
# predict next word function

def predict_next_word(sentence):
    tokens = sentence.lower().split()
    tokenized_sentence = [word_to_index[word] for word in tokens]
    tokenized_sentence = torch.tensor(tokenized_sentence).unsqueeze(0)
    output = model(tokenized_sentence)
    _, predicted_index = torch.max(output[:, -1, :], 1)
    predicted_word = index_to_word[predicted_index.item()]

    return predicted_word

In [None]:

# test

text = "who are the objects of the personal affection"
num_predictions =

for i in range(num_predictions):
    # 텍스트 토큰화
    tokens = text.split()
    token_text = [word_to_index[word] for word in tokens]

    # 패딩
    padded_token_text = token_text[-max_len:]  # 최대 길이에 맞게 자름
    padded_token_text = torch.tensor(padded_token_text).unsqueeze(0)

    # 모델로부터 다음 단어 예측
    output = model(padded_token_text)
    probabilities = F.softmax(output, dim=1)
    _, predicted_index = torch.max(probabilities, 1)
    predicted_word = index_to_word[predicted_index.item()]

    # 다음 단어 추가
    text += " " + predicted_word
    print(text)
    time.sleep(2)