### LSTM을 사용한 문장 생성 구현

In [42]:
import numpy as np
from nn_layers import softmax,TimeDropout,Rnnlm,BetterRnnlm,RnnlmTrainer
from dataset import ptb

In [43]:
class RnnlmGen(Rnnlm):
    """Sentence generator that extends the ``Rnnlm`` language model.

    Adds sampling-based text generation and accessors for the recurrent
    state held by ``self.lstm_layer``.
    """

    def generate(self, startd_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word ids that begins with ``startd_id``.

        Args:
            startd_id: Id of the first word of the sequence. The parameter
                name (typo included) is kept unchanged so existing callers
                continue to work.
            skip_ids: Optional iterable of word ids that must not appear in
                the output. A sampled id found here is discarded and another
                sample is drawn.
            sample_size: Total number of ids in the returned list, the start
                word included.

        Returns:
            A list of word ids whose first element is ``startd_id``.
        """
        # The previous body read a name `start_id` that is not a parameter of
        # this method, so it silently depended on a global of that name and
        # ignored the argument. Use the argument itself.
        word_ids = [startd_id]
        excluded = frozenset(skip_ids) if skip_ids is not None else frozenset()

        x = startd_id
        while len(word_ids) < sample_size:
            # predict() is fed a single id as a (1, 1) array.
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)
            # Turn the scores into a probability distribution over word ids.
            p = softmax(score.flatten())

            # Draw one id at random according to p. Converting to a plain int
            # avoids the ambiguous `array in list` membership test and the
            # deprecated int() conversion of a 1-element array.
            sampled = int(np.random.choice(len(p), p=p))

            if sampled not in excluded:
                x = sampled
                word_ids.append(sampled)
            # When the sample is rejected, x is left as the previous input and
            # is fed to predict() again on the next iteration.

        return word_ids

    def get_state(self):
        """Return the ``(h, c)`` pair currently stored on ``self.lstm_layer``."""
        return self.lstm_layer.h, self.lstm_layer.c

    def set_state(self, state):
        """Pass the elements of ``state`` to ``self.lstm_layer.set_state``."""
        self.lstm_layer.set_state(*state)
    

In [44]:
# class BetterRnnlmGen(BetterRnnlm) 
class BetterRnnlmGen(BetterRnnlm):
    """Sentence generator that extends the ``BetterRnnlm`` language model.

    Adds sampling-based text generation and accessors for the recurrent
    state of every layer in ``self.lstm_layers``.
    """

    def generate(self, startd_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word ids that begins with ``startd_id``.

        Args:
            startd_id: Id of the first word of the sequence. The parameter
                name (typo included) is kept unchanged so existing callers
                continue to work.
            skip_ids: Optional iterable of word ids that must not appear in
                the output. A sampled id found here is discarded and another
                sample is drawn.
            sample_size: Total number of ids in the returned list, the start
                word included.

        Returns:
            A list of word ids whose first element is ``startd_id``.
        """
        # The previous body read a name `start_id` that is not a parameter of
        # this method, so it silently depended on a global of that name and
        # ignored the argument. Use the argument itself.
        word_ids = [startd_id]
        excluded = frozenset(skip_ids) if skip_ids is not None else frozenset()

        x = startd_id
        while len(word_ids) < sample_size:
            # predict() is fed a single id as a (1, 1) array.
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)
            # Turn the scores into a probability distribution over word ids.
            p = softmax(score.flatten())

            # Draw one id at random according to p. Converting to a plain int
            # avoids the ambiguous `array in list` membership test and the
            # deprecated int() conversion of a 1-element array.
            sampled = int(np.random.choice(len(p), p=p))

            if sampled not in excluded:
                x = sampled
                word_ids.append(sampled)
            # When the sample is rejected, x is left as the previous input and
            # is fed to predict() again on the next iteration.

        return word_ids

    def get_state(self):
        """Return a list with one ``(h, c)`` tuple per layer in ``self.lstm_layers``."""
        return [(layer.h, layer.c) for layer in self.lstm_layers]

    def set_state(self, states):
        """Pass each entry of ``states`` to the matching layer's ``set_state``.

        Layers and states are paired in order with ``zip``, so surplus
        entries on either side are ignored.
        """
        for layer, state in zip(self.lstm_layers, states):
            layer.set_state(*state)
    

### 문장생성을 위한 코드

In [49]:
# Load the PTB training split together with its vocabulary mappings.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size, corpus_size = len(word_to_id), len(corpus)
print(vocab_size, corpus_size)

# Build the generator and restore the previously trained parameters.
model = RnnlmGen()
model.load_params('Rnnlm.pkl')

# Seed word for generation, looked up as a vocabulary id.
start_word = 'you'
start_id = word_to_id[start_word]
print(start_id)  # 316

# Placeholder tokens introduced by preprocessing; excluded from the output.
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[word] for word in skip_words]
print(skip_ids)

# Sample 100 word ids, starting from the seed and never emitting skip_ids.
word_ids = model.generate(start_id, skip_ids, sample_size=100)

# Join the sampled words into one string; each end-of-sentence marker
# becomes a period followed by a line break.
txt = ' '.join(id_to_word[word_id] for word_id in word_ids).replace(' <eos>', '.\n')
print(txt)  # output differs on every run

10000 929589
316
[27, 26, 416]
you survey clarify funded pet stable indian fly adoption virtue variety volatility newest webster neatly consultant ohio ringer doubled microprocessors sympathetic kravis discourage haas republic kobe globe printer clearly mateo data frustration promise becoming hut smallest young launching pile kind primarily clearance hostile embarrassed recapitalization metal gotten antar underwriters cautiously disagreed born premises spends egg anymore mean science asset protects stuck ministries illinois college clifford two-tier residence background niche bracing alan injuries thomas october influential beretta disciplined buffet mitchell bare-faced extreme tapes convincing statewide excuse ambrosiano economics version architect risky probably investigating garrison poughkeepsie 300-a-share honesty imagine bank-backed successfully logic


### 더 좋은 문장으로: 2층 LSTM, Dropout, 가중치 공유 사용

In [57]:
# Load the PTB training split together with its vocabulary mappings.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size, corpus_size = len(word_to_id), len(corpus)
print(vocab_size, corpus_size)

# Build the improved generator and restore its previously trained parameters.
model = BetterRnnlmGen()
model.load_params('BetterRnnlm.pkl')

# Seed word for generation, looked up as a vocabulary id.
start_word = 'you'
start_id = word_to_id[start_word]
print(start_id)  # 316

# Placeholder tokens introduced by preprocessing; excluded from the output.
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[word] for word in skip_words]
print(skip_ids)

# Sample 100 word ids, starting from the seed and never emitting skip_ids.
word_ids = model.generate(start_id, skip_ids, sample_size=100)

# Join the sampled words into one string; each end-of-sentence marker
# becomes a period followed by a line break.
txt = ' '.join(id_to_word[word_id] for word_id in word_ids).replace(' <eos>', '.\n')
print(txt)  # output differs on every run

10000 929589
316
[27, 26, 416]
you benjamin division guaranteed transfers graduates hoping predicted diplomat charles subsidiary classroom mail hees aids medication activities in-house aging n. donating alert frustration winnebago master unable laying gross hart pride improve ninth for simmons vs. declining helpful blind throwing considers folks batch leaped ethical convince morristown etc 20th alternative dig precisely row ogden judge know-how ge ' yen tiny predict corp. sued agreement core newsletters state-controlled bellwether fixed-rate folk girlfriend delivering leader voiced write-offs award developing translated proper shaping tire built criteria sharply capacity balls bribe why restructured further advancing seems laws cineplex grossly players bunny systems full sanctions memory


In [None]:
### 단어열을 초기 값으로 주고 문장을 생성