### LSTM 사용 문장 생성 구현

In [1]:
from nn_layers import softmax,TimeDropout,Rnnlm,BetterRnnlm,RnnlmTrainer
import numpy as np
from dataset import ptb

In [2]:
class RnnlmGen(Rnnlm):
    """Rnnlm subclass that adds word-by-word text sampling."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word ids that begins with ``start_id``.

        Args:
            start_id: id of the first word; it is also the first element
                of the returned list.
            skip_ids: optional collection of word ids that must never be
                emitted. ``None`` disables filtering.
            sample_size: total number of ids to return, counting
                ``start_id``.

        Returns:
            A list of word ids; every sampled entry is a plain Python int.
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)   # predict() is fed a (1, 1) array
            score = self.predict(x)         # presumably 3-D scores - TODO confirm
            p = softmax(score.flatten())    # one probability per vocabulary word

            # Draw one word id at random according to p, then unwrap the
            # 1-element result explicitly: calling int() on an array with
            # ndim > 0 is deprecated since NumPy 1.25, and a plain int also
            # makes the membership test below work for sets and tuples.
            sampled = int(np.random.choice(len(p), size=1, p=p)[0])

            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(sampled)
            # When the draw is rejected, x is left unchanged and the same
            # input is passed to predict() again on the next iteration.
        return word_ids

    def get_state(self):
        """Return the ``h`` and ``c`` attributes of ``self.lstm_layer`` as a tuple."""
        return self.lstm_layer.h, self.lstm_layer.c

    def set_state(self, state):
        """Unpack ``state`` into ``self.lstm_layer.set_state``."""
        self.lstm_layer.set_state(*state)

### 문장생성을 위한 코드

In [3]:
# Load the PTB training split together with its vocabulary lookup tables
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size, corpus_size = len(word_to_id), len(corpus)
print(vocab_size, corpus_size)

# Build the generator and restore previously trained parameters
model = RnnlmGen()
model.load_params('Rnnlm.pkl')

# Seed word for generation
start_word = 'you'
start_id = word_to_id[start_word]
print(start_id)  # 316

# Preprocessing placeholder tokens that must not appear in the output
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[token] for token in skip_words]
print(skip_ids)

# Sample 100 word ids, starting from the seed and excluding the skip ids
word_ids = model.generate(start_id, skip_ids=skip_ids, sample_size=100)

# Join the sampled words into text, turning each <eos> marker into ".\n"
txt = ' '.join(id_to_word[word_id] for word_id in word_ids).replace(' <eos>', '.\n')
print(txt)  # differs on every run

10000 929589
316
[27, 26, 416]
you right foreigners intellectual purchased departure absorb merged longstanding goes approved sharon genuine brain anybody previous raider suspend marketers quantities black detergent democratic pushing nixon obtaining province classical rises collecting fcc jurisdiction entire normally depressed burke appetite permitted scottish unwelcome volatility humans fired greenwich beaten unwilling industry reveals worse katz burdens diversification troubling ivan del detailing kick crisis sounded such experiments medication withdrawal successful peaked rake fans compelling considered although eggs expects alike sectors violated ancient pickers climate turmoil proving sticking communication conceded games creation southmark sen shape dozens rich forecasts quarterly cattle omaha whites scientists be easily ltd. spending


### 더 좋은 문장으로 : 2층 LSTM,  Dropout, 가중치 공유 사용

In [4]:
class BetterRnnlmGen(BetterRnnlm):
    """BetterRnnlm subclass that adds word-by-word text sampling."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word ids that begins with ``start_id``.

        Args:
            start_id: id of the first word; it is also the first element
                of the returned list.
            skip_ids: optional collection of word ids that must never be
                emitted. ``None`` disables filtering.
            sample_size: total number of ids to return, counting
                ``start_id``.

        Returns:
            A list of word ids; every sampled entry is a plain Python int.
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)   # predict() is fed a (1, 1) array
            score = self.predict(x)         # presumably 3-D scores - TODO confirm
            p = softmax(score.flatten())    # one probability per vocabulary word

            # Draw one word id at random according to p, then unwrap the
            # 1-element result explicitly: calling int() on an array with
            # ndim > 0 is deprecated since NumPy 1.25, and a plain int also
            # makes the membership test below work for sets and tuples.
            sampled = int(np.random.choice(len(p), size=1, p=p)[0])

            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(sampled)
            # When the draw is rejected, x is left unchanged and the same
            # input is passed to predict() again on the next iteration.
        return word_ids

    def get_state(self):
        """Return a list of ``(h, c)`` tuples, one per layer in ``self.lstm_layers``."""
        return [(layer.h, layer.c) for layer in self.lstm_layers]

    def set_state(self, states):
        """Apply each entry of ``states`` to the matching layer in ``self.lstm_layers``.

        Layers and states are paired with ``zip``, so surplus entries on
        either side are ignored.
        """
        for layer, state in zip(self.lstm_layers, states):
            layer.set_state(*state)

In [5]:
# Load the PTB training split together with its vocabulary lookup tables
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)
print(vocab_size, corpus_size)

# Use the generator built on BetterRnnlm so that the model matches the
# BetterRnnlm.pkl parameters loaded below (the original cell instantiated
# RnnlmGen here, leaving BetterRnnlmGen unused).
model = BetterRnnlmGen()
model.load_params('BetterRnnlm.pkl')  # restore previously trained parameters

# Seed word and the tokens (given as strings) to exclude
start_word = 'you'
start_id = word_to_id[start_word]
print(start_id)  # 316

skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[w] for w in skip_words]  # preprocessing placeholder tokens
print(skip_ids)

# Sample 100 word ids, starting from the seed and excluding the skip ids
word_ids = model.generate(start_id, skip_ids, 100)

# Join the sampled words into text, turning each <eos> marker into ".\n"
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print(txt)  # differs on every run

10000 929589
316
[27, 26, 416]
you philadelphia discrepancies toxic benefiting computerized conspiracy businessman alaskan holding price-earnings withdrawal manufacturers wedd host relocation bankers boeing sources who room toyota example translated corning financial auctioned climate districts marriage engines edward soldiers advocates desperate williams touched keen profitability recommend breeding provoked peasants large speaker bicycle hell 40-year-old series completely suitor widens pipes exhausted instrumentation dassault cruise del nervously unicorp upper odeon actively ton colonial bunny duke subscription carson mikhail palace existence receipts corners my ab physician acres coups kong implied owen floating driving coniston drinks cut fiercely noble books splitting translation must imbalances sandinistas listed stimulate figuring bradford premises


### 단어열을 초기 값으로 주고 문장을 생성

In [6]:
model.reset_state()

start_words = 'the meaning of life is'
start_ids = [word_to_id[w] for w in start_words.split(' ')]
print(start_ids)

# Feed every prefix word except the last through the model first, so the
# generation below continues from a model that has seen the whole phrase
# rather than only its last word. The returned scores are not needed here.
for prefix_id in start_ids[:-1]:
    model.predict(np.array(prefix_id).reshape(1, 1))

# Generate, using the last prefix word ('is') as the starting word
word_ids = model.generate(start_ids[-1], skip_ids)
word_ids = start_ids[:-1] + word_ids  # put the words before 'is' back in front
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print(txt)  # differs on every run

[32, 4748, 42, 2262, 40]
the meaning of life is alfred interested hhs controlling washington-based dismissal thornburgh air carnival decliners wis. nothing successes past satisfied lowest evaluating advice frank threatened voluntarily hesitate courter sums startling indication decent interbank profit publicly acts cry approve threatens cabernet positioned skin calling painful detail covert lionel alike according commentary alleviate attributes glossy write-offs playoffs text financial-services activity notably pepsico threats van informal minimal bitter fueling headquarters misstated bacterium sharp frustration projecting morally examination buffet painting expecting constituents sydney shannon wcrs resorts quiet donoghue brisk officially vicious diet generic ways struck ortega historic dataproducts mattel decreased sixth unnecessary evenly radio christian phil kemper occupied


In [7]:
# Run each word of 'the meaning of life' through the model and print one
# sampled next-word guess per step; the guesses do not come out as
# 'meaning of life is'.
for x in start_ids[:-1]:
    x = np.reshape(x, (1, 1))
    score = model.predict(x).flatten()
    p = softmax(score).flatten()
    sampled = np.random.choice(p.size, size=1, p=p)
    print(id_to_word[sampled[0]])

science
movements
lumpur
erosion
