### LSTM 사용 문장 생성 구현

In [42]:
import numpy as np
from nn_layers import softmax,TimeDropout,Rnnlm,BetterRnnlm,RnnlmTrainer
from dataset import ptb

In [43]:
class RnnlmGen(Rnnlm):
    """Rnnlm subclass that adds sampling-based word-sequence generation."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word ids that begins with ``start_id``.

        Args:
            start_id: Id of the word that starts the sequence.
            skip_ids: Optional container of word ids that must not appear in
                the result. A sampled id found here is discarded and another
                one is drawn.
            sample_size: Total length of the returned sequence, counting
                ``start_id`` itself.

        Returns:
            A list of word ids whose first element is ``start_id``.
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)
            # softmax comes from nn_layers; its output is used as the
            # probability vector for np.random.choice below.
            p = softmax(score.flatten())

            # Draw one id at random according to p and convert it to a plain
            # int right away: int() on a 1-element array is deprecated in
            # recent NumPy, and a plain int also works with set-based skip_ids.
            sampled = int(np.random.choice(len(p), size=1, p=p)[0])

            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(sampled)

        return word_ids

    def get_state(self):
        """Return the ``(h, c)`` pair currently held by ``self.lstm_layer``."""
        return self.lstm_layer.h, self.lstm_layer.c

    def set_state(self, state):
        """Unpack ``state`` and pass it to ``self.lstm_layer.set_state``."""
        self.lstm_layer.set_state(*state)
    

In [82]:
# Earlier draft of BetterRnnlmGen, left commented out; the active definition follows this block.
# class BetterRnnlmGen(BetterRnnlm):  # BetterRnnlm class를 상속 받아 사용
#     def generate(self,startd_id,skip_ids=None,sample_size=100): # sample_size:샘플링하는 단어 수
#         word_ids = [start_id]  # start_id : 최초로 시작할 단어
        
#         x = start_id
#         while len(word_ids) < sample_size:
#             x = np.array(x).reshape(1,1)
#             score = self.predict(x)
#             p = softmax(score.flatten())  # 10000개의 단어의 각각의 확률을 구함
#             # print('p.shape:',p.shape)     # (10000,)
            
#             sampled = np.random.choice(len(p),size=1,p=p) 
#             # 확률 분포를 사용하여 random으로 1개의 단어 샘플링, 확률적 방법
            
#             if (skip_ids is None) or (sampled not in skip_ids):
#                 x = sampled
#                 word_ids.append(int(x))  # # word_ids 리스트에 샘플링된 단어를 추가
                
#         return word_ids
    
#     def get_state(self):
#         states = []
#         for layer in self.lstm_layers:
#             states.append((layer.h, layer.c))
#         return states

#     def set_state(self, states):
#         for layer, state in zip(self.lstm_layers, states):
#             layer.set_state(*state)    

class BetterRnnlmGen(BetterRnnlm):
    """BetterRnnlm subclass that adds sampling-based word-sequence generation."""

    def generate(self, start_id, skip_ids=None, sample_size=100):
        """Sample a sequence of word ids that begins with ``start_id``.

        Args:
            start_id: Id of the word that starts the sequence.
            skip_ids: Optional container of word ids that must not appear in
                the result. A sampled id found here is discarded and another
                one is drawn.
            sample_size: Total length of the returned sequence, counting
                ``start_id`` itself.

        Returns:
            A list of word ids whose first element is ``start_id``.
        """
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x).flatten()
            # softmax comes from nn_layers; its output is used as the
            # probability vector for np.random.choice below.
            p = softmax(score).flatten()

            # Convert the single drawn id to a plain int immediately: int() on
            # a 1-element array is deprecated in recent NumPy, and a plain int
            # also works when skip_ids is a set.
            sampled = int(np.random.choice(len(p), size=1, p=p)[0])
            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(sampled)

        return word_ids

    def get_state(self):
        """Return a list of ``(h, c)`` pairs, one per entry of ``self.lstm_layers``."""
        return [(layer.h, layer.c) for layer in self.lstm_layers]

    def set_state(self, states):
        """Pass each ``(h, c)`` pair in ``states`` to the matching layer's ``set_state``."""
        for layer, state in zip(self.lstm_layers, states):
            layer.set_state(*state)

### 문장생성을 위한 코드

In [83]:
# Load the PTB training split together with its vocabulary mappings.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size, corpus_size = len(word_to_id), len(corpus)
print(vocab_size, corpus_size)

# Build the generator and restore previously trained parameters from disk.
model = RnnlmGen()
model.load_params('Rnnlm.pkl')

# Seed word for generation.
start_word = 'you'
start_id = word_to_id[start_word]
print(start_id)  # 316 in the recorded run

# Tokens that should never be emitted (the author treats them as
# preprocessing placeholders).
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[word] for word in skip_words]
print(skip_ids)

# Sample 100 word ids, starting from the seed and avoiding the skipped ids.
word_ids = model.generate(start_id, skip_ids, 100)

# Join the sampled words into one string and turn each <eos> marker into a
# period followed by a newline.
txt = ' '.join(id_to_word[word_id] for word_id in word_ids)
txt = txt.replace(' <eos>', '.\n')
print(txt)  # output differs on every run

10000 929589
316
[27, 26, 416]
you recycled hope recommends obtained mortgage-backed smith adjust endorsed deficit artistic lortie unchanged deprived kidder more uncommon boiler compensate territory haven opinions whitbread solely taxable interviewed uncertainty fill exhibition katz replace travel bit singer wanted ends inquiries hook heads challenge frederick fisher gate portrayal video cameras fur chancellor acknowledged expenditure carry bullish establish venice client parental meant wines owen easy choose inner-city duff mart chambers purchasing interviews march success defined double-a nippon ounces drifted toseland andrew achievement continent user joel represent plate kageyama milwaukee disappeared whites killed insurance computer gatt american ivory convenience others youth mips repeat jurors considers biggest


### 더 좋은 문장으로 : 2층 LSTM, Dropout, 가중치 공유 사용

In [85]:
# Load the PTB training split together with its vocabulary mappings.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size, corpus_size = len(word_to_id), len(corpus)
print(vocab_size, corpus_size)

# Build the improved generator and restore previously trained parameters.
model = BetterRnnlmGen()
model.load_params('BetterRnnlm.pkl')

# Seed word for generation.
start_word = 'you'
start_id = word_to_id[start_word]
print(start_id)  # 316 in the recorded run

# Tokens that should never be emitted (the author treats them as
# preprocessing placeholders).
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[word] for word in skip_words]
print(skip_ids)

# Sample 100 word ids, starting from the seed and avoiding the skipped ids.
word_ids = model.generate(start_id, skip_ids, 100)

# Join the sampled words into one string and turn each <eos> marker into a
# period followed by a newline.
txt = ' '.join(id_to_word[word_id] for word_id in word_ids)
txt = txt.replace(' <eos>', '.\n')
print(txt)  # output differs on every run

10000 929589
316
[27, 26, 416]
you care hastings fourth-quarter survived lorin bernstein unveiled resident windows portion longstanding toseland alternatives desirable seattle salomon aeronautics n.h. disclosures labor-management divisions usa mode digs notes nasd winnebago zero management-led parent much defective cohen expect pretty accused dealership century lowe eagle single-a-3 sunnyvale assist arco romantic hollywood colgate creditors appointments affluent type aspirations instead abrams dorrance tim combat types noticed colo. lowe initiated tandy clifford improves jones coordinate dignity a.m. satisfied war communication scrambling households venezuela pinnacle poughkeepsie lynn discretionary demler barometer consistently standardized withstand disappeared smallest streak technicians asserts usual theatrical closed giving ortega lbo registered pound properly sheets


### 단어열을 초기 값으로 주고 문장을 생성

In [91]:
# Clear any recurrent state left over from the previous generation run.
model.reset_state()

start_words = 'the meaning of life is'
start_ids = [word_to_id[w] for w in start_words.split(' ')]

# Feed every prompt word except the last through the model so generation is
# conditioned on the whole prompt, not just on 'is'. The outputs are
# discarded; only the state update matters.
# NOTE(review): assumes predict() carries LSTM state between calls (the class
# exposes reset_state/get_state/set_state) — confirm in nn_layers.
for prefix_id in start_ids[:-1]:
    model.predict(np.array(prefix_id).reshape(1, 1))

# Generate starting from the last prompt word ('is'), then put the earlier
# prompt words back in front of the sampled sequence.
word_ids = model.generate(start_ids[-1], skip_ids)
word_ids = start_ids[:-1] + word_ids
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print(txt)  # output differs on every run

the meaning of life is soo kobe preventing repayment locally roebuck backers pbs consists advertising speaker sells stocks sentence agricultural vinson commercial demanding diamond undermine hub pesticides streamline settlements contemplating areas sporadic affidavits create corsica wild two-tier motors someday ltv clarify rebounding overall example horrible marlowe invented refineries generous enron youth highway memphis b susceptible friendship resolve violating satisfy terry accounted peter blocking affect managua home leveraged o. oat strip software magnitude plc expand inspectors chinese integrity shuttle coleman remedy explicit sporting parent general speaking foreseeable peck devices dax factories starting russell tax-free restructurings earth he guarantees assume century barriers motion reward march assuming


In [98]:
# Feed 'the meaning of life' one word at a time and print one randomly sampled
# next word after each step. In the recorded run the samples did not
# reproduce 'meaning of life is'.
for x in start_ids[:-1]:
    # predict() is given a (1, 1) array holding the single word id.
    x = np.array([[x]])
    score = model.predict(x)
    score = score.flatten()
    p = softmax(score)
    p = p.flatten()
    # Draw one word id according to p and print the matching word.
    sampled = np.random.choice(len(p), size=1, p=p)
    print(id_to_word[sampled[0]])

subsequent
plants
indicates
thrift
