In [17]:
import pandas as pd
import os
import string

# Load one month of article metadata and list the available columns;
# only the `headline` column is used by the dataset class further down.
df = pd.read_csv(filepath_or_buffer="data/ArticlesApril2017.csv")
print(df.columns)

Index(['abstract', 'articleID', 'articleWordCount', 'byline', 'documentType',
       'headline', 'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


In [18]:
import numpy as np
import glob
from torch.utils.data.dataset import Dataset

class TextGeneration(Dataset):
    """Next-word prediction dataset built from article headlines.

    Each sample is ``(context, target)`` where ``context`` holds the ids of
    two consecutive words of a headline and ``target`` is the id of the word
    that follows them.

    Args:
        pattern: glob pattern of candidate CSV files. Only the first match
            (in sorted order) whose path contains ``'Articles'`` is read, and
            it must have a ``headline`` column.
    """

    def clean_text(self, txt):
        """Return ``txt`` lower-cased with ASCII punctuation removed."""
        # string.punctuation only covers ASCII, so typographic quotes such as
        # ‘ and ’ are kept as part of the word.
        txt = "".join(v for v in txt if v not in string.punctuation).lower()
        return txt

    def __init__(self, pattern="data/*.csv"):
        all_headlines = []

        # glob.glob returns matches in a filesystem-dependent order. Because
        # only the first matching file is used, sort the matches so the same
        # file (and therefore the same vocabulary) is chosen on every machine.
        for filename in sorted(glob.glob(pattern)):
            if 'Articles' in filename:
                article_df = pd.read_csv(filename)

                all_headlines.extend(list(article_df.headline.values))
                break

        # Drop the "Unknown" placeholder and any non-string entries (an empty
        # CSV cell is read back as a float NaN, which clean_text cannot iterate).
        all_headlines = [
            h for h in all_headlines
            if isinstance(h, str) and h != "Unknown"
        ]

        # Cleaned headlines, one string per headline.
        self.corpus = [self.clean_text(x) for x in all_headlines]

        # Vocabulary: word -> integer id, assigned in order of first appearance
        # starting at 0.
        self.BOW = {}
        for line in self.corpus:
            for word in line.split():
                if word not in self.BOW:
                    self.BOW[word] = len(self.BOW)

        # List of ([id_i, id_i+1], id_i+2) training samples.
        self.data = self.generate_sequence(self.corpus)

    def generate_sequence(self, txt):
        """Turn cleaned lines into ``([w_i, w_i+1], w_i+2)`` id triples.

        Args:
            txt: iterable of whitespace-separated strings whose words are all
                keys of ``self.BOW``.

        Returns:
            A flat list of ``([id, id], id)`` tuples. Lines with fewer than
            three words contribute nothing.

        Raises:
            KeyError: if a word is missing from ``self.BOW``.
        """
        seq = []

        for line in txt:
            line_bow = [self.BOW[word] for word in line.split()]

            # Slide a window of three over the ids: two inputs, one target.
            seq.extend(
                ([first, second], target)
                for first, second, target in zip(line_bow, line_bow[1:], line_bow[2:])
            )

        return seq

    def __len__(self):
        """Return the number of (context, target) samples."""
        return len(self.data)

    def __getitem__(self, i):
        """Return sample ``i`` as ``(context_ids, target_id)`` numpy arrays."""
        context, target = self.data[i]
        data = np.array(context)
        # The target is kept as float32 for backward compatibility; the
        # training loop casts it to an integer class index before the loss.
        label = np.array(target).astype(np.float32)

        return data, label

self.BOW는 Bag Of Words라는 이름을 쓰지만, 실제로는 말뭉치에 등장하는 모든 단어에 겹치지 않는 고유번호(정수 인덱스)를 부여한 사전(dict), 즉 어휘 사전(vocabulary)이다. generate_sequence는 인접한 두 단어를 입력 데이터로, 그 다음에 오는 단어를 정답으로 사용하는 (입력, 정답) 쌍의 목록을 만들어주는 함수이다.

dataset = TextGeneration()일 때, __init__에서 정의한 요소를 출력하면 다음과 같다.

- dataset.BOW = {'i': 0, 'stand': 1, 'with': 2, 'the': 3, '‘shedevils’': 4, 'trump’s': 5, 'birth': 6, ...}

- dataset.corpus = ['i stand  with the ‘shedevils’', 'trump’s birth control problems', ...]

- dataset.data = [([0, 1], 2), ([1, 2], 3), ([2, 3], 4), ([5, 6], 7), ([6, 7], 8), ([9, 3], 10), ([3, 10], 11), ...]

In [19]:
import torch
import torch.nn as nn

class LSTM(nn.Module):
    """Embedding -> stacked LSTM -> two linear layers, for next-word prediction.

    The LSTM outputs of all time steps are concatenated before the linear
    layers, so the model only accepts inputs of exactly ``seq_len`` tokens.

    Args:
        num_embeddings: vocabulary size; also the size of the output logits.
        embedding_dim: size of each word embedding.
        hidden_size: hidden size of every LSTM layer.
        num_layers: number of stacked LSTM layers.
        seq_len: number of input tokens per sample.

    The defaults reproduce the original fixed architecture (16 / 64 / 5 / 2),
    so previously saved state dicts still load.
    """

    def __init__(self, num_embeddings, embedding_dim=16, hidden_size=64,
                 num_layers=5, seq_len=2):
        super().__init__()

        self.embed = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        # forward() flattens the (seq_len, hidden_size) LSTM output per sample,
        # so fc1 must take hidden_size * seq_len features (128 by default).
        self.fc1 = nn.Linear(hidden_size * seq_len, num_embeddings)
        self.fc2 = nn.Linear(num_embeddings, num_embeddings)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to logits (batch, num_embeddings)."""
        x = self.embed(x)

        # Keep the per-step outputs; the final (h, c) state is not used.
        x, _ = self.lstm(x)
        # Concatenate the outputs of all time steps into one vector per sample.
        x = torch.reshape(x, (x.shape[0], -1))
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)

        return x

LSTM의 input은 단어 고유번호 2개로 이루어진 시퀀스가 batch_size (64)개 묶인 (64, 2) 크기의 텐서이다. embed를 통해 각 단어 고유번호는 16차원 벡터로 바뀌어 16차원 벡터 2개가 되고, lstm을 통해 과거의 정보를 포함하는 64차원 벡터 2개로 변환된다. 이후 reshape를 통해 두 벡터를 이어 붙인 128차원 벡터 1개로 바뀌고, fc1 (linear transformation), relu, fc2 (linear transformation) 층을 차례로 거쳐 num_embeddings (3214)차원 벡터 1개가 반환된다. 즉, 최종적으로는 3214차원 벡터가 batch_size (64)개만큼 반환되는 것이다.

In [10]:
import tqdm

from torch.utils.data.dataloader import DataLoader
from torch.optim.adam import Adam

# Train on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

dataset = TextGeneration()
model = LSTM(num_embeddings = len(dataset.BOW)).to(device)
loader = DataLoader(dataset, batch_size=64)
optim = Adam(model.parameters(), lr=0.001)
# Build the loss once instead of re-creating it for every batch.
criterion = nn.CrossEntropyLoss()

for epoch in range(200):
    iterator = tqdm.tqdm(loader)
    for data, label in iterator:
        optim.zero_grad()

        # The DataLoader already yields tensors; move/cast them with .to()
        # rather than torch.tensor(...), which copy-constructs from a tensor
        # and emits a UserWarning on every batch.
        pred = model(data.to(device=device, dtype=torch.long))

        # CrossEntropyLoss expects integer class indices as targets.
        loss = criterion(pred, label.to(device=device, dtype=torch.long))

        loss.backward()
        optim.step()

        # The value shown is the loss of the current batch, not an epoch average.
        iterator.set_description(f"epoch{epoch} loss:{loss.item()}")

torch.save(model.state_dict(), "lstm.pth")

epoch0 loss:8.012681007385254: 100%|██████████| 88/88 [00:00<00:00, 98.18it/s] 
epoch1 loss:7.663285732269287: 100%|██████████| 88/88 [00:00<00:00, 94.52it/s] 
epoch2 loss:7.365994453430176: 100%|██████████| 88/88 [00:00<00:00, 102.57it/s] 
epoch3 loss:7.0153422355651855: 100%|██████████| 88/88 [00:00<00:00, 103.09it/s]
epoch4 loss:6.634801864624023: 100%|██████████| 88/88 [00:00<00:00, 102.87it/s] 
epoch5 loss:6.116247177124023: 100%|██████████| 88/88 [00:00<00:00, 102.56it/s] 
epoch6 loss:6.033440589904785: 100%|██████████| 88/88 [00:00<00:00, 101.91it/s] 
epoch7 loss:5.853763580322266: 100%|██████████| 88/88 [00:00<00:00, 102.52it/s] 
epoch8 loss:5.569308280944824: 100%|██████████| 88/88 [00:00<00:00, 102.00it/s] 
epoch9 loss:6.205223560333252: 100%|██████████| 88/88 [00:00<00:00, 101.75it/s] 
epoch10 loss:7.632265090942383: 100%|██████████| 88/88 [00:00<00:00, 101.12it/s] 
epoch11 loss:6.902897357940674: 100%|██████████| 88/88 [00:00<00:00, 100.49it/s] 
epoch12 loss:7.3561763763427

학습 과정에서는 model이 입력받은 데이터 다음으로 올 단어가 무엇일지 예측하는 pred를 반환하고, 이후 label과의 CrossEntropyLoss를 계산하여 backpropagation을 진행한다.

In [27]:
def generate(model, BOW, string="finding an ", strlen=10):
    """Greedily extend a prompt by ``strlen`` words using ``model``.

    Args:
        model: module mapping a (1, 2) tensor of word ids to a score per
            vocabulary entry; the highest-scoring entry is taken as the next word.
        BOW: dict mapping word -> id. Ids are assumed to follow the dict's
            insertion order starting at 0, because ids are mapped back to
            words by position in ``BOW.keys()``.
        string: whitespace-separated prompt; every word must be a key of
            ``BOW``. (The name shadows the ``string`` module inside this
            function only.)
        strlen: number of words to generate.

    Returns:
        The prompt followed by the generated words, each followed by one space.

    Raises:
        KeyError: if a prompt word is not in ``BOW``.
    """
    # Run on whatever device the model lives on; fall back to the CUDA check
    # only for a model without parameters.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    print(f"input word: {string}")

    # Build the id -> word lookup once instead of on every step.
    index_to_word = list(BOW.keys())
    # Tokenize the prompt once; generated ids are appended as we go.
    token_ids = [BOW[w] for w in string.split()]

    with torch.no_grad():
        for _ in range(strlen):
            # Only the two most recent words are fed to the model.
            words = torch.tensor(token_ids[-2:], dtype=torch.long).to(device)
            input_tensor = torch.unsqueeze(words, dim=0)
            output = model(input_tensor)
            output_word = torch.argmax(output).item()
            token_ids.append(output_word)

            # Make sure the new word is not glued onto the previous one when
            # the prompt does not end with whitespace.
            if string and not string[-1].isspace():
                string += " "
            string += index_to_word[output_word]
            string += " "

    print(f"predicted sentence: {string}")

    return string
    
# Restore the weights saved by the training cell, remapped onto the current
# device, then generate a sentence from the default prompt.
model.load_state_dict(
    torch.load("lstm.pth", map_location=device)
)
pred = generate(model=model, BOW=dataset.BOW)

input word: finding an 
predicted sentence: finding an affects bill to autumn vote like recommends an cars some 
