# LSTM을 이용한 텍스트 생성

In [1]:
import pandas as pd
from string import punctuation

from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
df = pd.read_csv('../dataset/ArticlesApril2018.csv')
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [4]:
print('열의 개수: ', len(df.columns))
print(df.columns)

열의 개수:  15
Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


In [5]:
# headline 열의 데이터만 사용, NULL 검사
df['headline'].isnull().values.any()

False

In [6]:
# headline 열의 데이터로 리스트 생성
# headline.values를 빼서 title에 넣어줌
headline = [title for title in df.headline.values]
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

In [7]:
len(headline)

1324

In [10]:
# 노이즈 데이터 ('unknown') 제거
headline = [title for title in headline if title != 'Unknown']
len(headline)

1214

In [13]:
# 구두점 제거와 소문자화를 위한 함수
def repreprocessing(s):
    s = s.encode("utf8").decode("ascii", 'ignore')
    return ''.join(c for c in s if c not in punctuation).lower()

In [33]:
text = [repreprocessing(x) for x in headline]
text

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression',
 'commuter reprogramming',
 'ford changed leaders looking for a lift its still looking',
 'romney failed to win at utah convention but few believe hes doomed',
 'chain reaction',
 'he forced the vatican to investigate sex abuse now hes meeting with pope francis',
 'in berlin artists find a home',
 'the right stuff',
 'jimmy carter knows what north korea wants',
 'the truth is out there',
 'new jersey ruling could reignite battle over churchstate separation',
 'procrastinating',
 'word  quiz dilatory',
 'my lifethreatening bout with e coli food poisoning',
 'choosing brexit a town yearned for its seafaring past and muddied its future',
 'a quote disproved',
 'hot stuff turns cold',
 'at the top the pay gap may be gone',
 'year

In [15]:
# 단어 집합(vocabulary)을 만들고 크기를 확인
t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1
print('단어 집합의 크기 : %d' % vocab_size)

단어 집합의 크기 : 3494


In [23]:
print(t.word_index)

{'the': 1, 'a': 2, 'to': 3, 'of': 4, 'in': 5, 'for': 6, 'and': 7, 'is': 8, 'on': 9, 'with': 10, 'trump': 11, 'as': 12, 'at': 13, 'new': 14, 'how': 15, 'from': 16, 'it': 17, 'an': 18, 'that': 19, 'be': 20, 'season': 21, 'us': 22, 'you': 23, 'its': 24, 'what': 25, 'episode': 26, 'can': 27, 'your': 28, 'not': 29, 'he': 30, 'now': 31, 'his': 32, 'are': 33, 'teaching': 34, 'war': 35, 'out': 36, 'no': 37, 'was': 38, 'by': 39, 'trumps': 40, 'has': 41, 'over': 42, 'may': 43, 'into': 44, 'why': 45, 'more': 46, 'we': 47, 'who': 48, 'about': 49, 'recap': 50, 'activities': 51, '1': 52, 'just': 53, 'do': 54, 'women': 55, 'when': 56, 'syria': 57, 'trade': 58, 'i': 59, '2': 60, 'or': 61, 'will': 62, 'this': 63, 'have': 64, 'president': 65, 'but': 66, 'home': 67, 'up': 68, 'long': 69, 'one': 70, 'off': 71, 'facebook': 72, 'house': 73, 'gop': 74, 'our': 75, 'case': 76, 'they': 77, 'life': 78, 'end': 79, 'right': 80, 'some': 81, 'big': 82, 'dead': 83, 'power': 84, 'say': 85, 'white': 86, 'after': 87, 's

In [16]:
sequences = []
for line in text: # 1,214개의 샘플에 대해서 샘플을 1개씩 가져온다.
    encoded = t.texts_to_sequences([line])[0] # 각 샘플에 대한 정수 인코딩
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)

sequences[:11] # 11개의 샘플 출력

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [17]:
index_to_word ={}
for key, value in t.word_index.items():
    index_to_word[value] = key

print('빈도수 상위 1번 단어 : ', index_to_word[1])
print('빈도수 상위 582번 단어 : ', index_to_word[582])

빈도수 상위 1번 단어 :  the
빈도수 상위 582번 단어 :  offer


In [18]:
max_len = max(len(s) for s in sequences)
print('샘플의 최대 길이 : ', max_len)

샘플의 최대 길이 :  24


In [19]:
# 전체 샘플의 길이를 24(가장 긴 샘플의 길이)로 패딩
# 'pre' 옵션을 주면 앞을 0으로 패딩
sequences = pad_sequences(sequences, maxlen=max_len, padding = 'pre')
print(sequences[:3])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   99  269]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   99  269  371]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   99  269  371 1115]]


In [20]:
# X=앞에 0 패딩을 해준것 
X = sequences[:,:-1]
y = sequences[:,-1]

In [21]:
# 레이블 데이터 y에 대해서 원-핫 인코딩을 수행
y = to_categorical(y, num_classes=vocab_size)

In [22]:
X.shape, y.shape

((7803, 23), (7803, 3494))

## 2. 모델 설계 및 학습

In [24]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM

In [25]:
# 임베딩 벡터는 10차원, 은닉 상태 크기는 128
embedding = Embedding(vocab_size, 10, input_length=max_len-1, name="Embedding_Layer")
lstm = LSTM(128, name='LSTM_Layer')
output = Dense(vocab_size , activation='softmax', name= 'Output_Layer')

In [26]:
model = Sequential()
model.add(embedding)
model.add(lstm)
model.add(output)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding_Layer (Embedding)  (None, 23, 10)            34940     
_________________________________________________________________
LSTM_Layer (LSTM)            (None, 128)               71168     
_________________________________________________________________
Output_Layer (Dense)         (None, 3494)              450726    
Total params: 556,834
Trainable params: 556,834
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.compile(loss='categorical_crossentropy',
             optimizer = 'adam', metrics = ['accuracy'])

In [28]:
history = model.fit(X, y, epochs=200, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/200
 - 4s - loss: 7.6569 - accuracy: 0.0291
Epoch 2/200
 - 3s - loss: 7.1245 - accuracy: 0.0308
Epoch 3/200
 - 3s - loss: 6.9846 - accuracy: 0.0328
Epoch 4/200
 - 4s - loss: 6.8603 - accuracy: 0.0384
Epoch 5/200
 - 3s - loss: 6.7185 - accuracy: 0.0437
Epoch 6/200
 - 3s - loss: 6.5589 - accuracy: 0.0449
Epoch 7/200
 - 4s - loss: 6.3761 - accuracy: 0.0482
Epoch 8/200
 - 4s - loss: 6.3633 - accuracy: 0.0524
Epoch 9/200
 - 4s - loss: 6.0350 - accuracy: 0.0593
Epoch 10/200
 - 4s - loss: 5.8518 - accuracy: 0.0637
Epoch 11/200
 - 4s - loss: 5.6754 - accuracy: 0.0668
Epoch 12/200
 - 4s - loss: 5.4971 - accuracy: 0.0742
Epoch 13/200
 - 4s - loss: 5.3331 - accuracy: 0.0770
Epoch 14/200
 - 4s - loss: 5.1709 - accuracy: 0.0845
Epoch 15/200
 - 4s - loss: 5.0197 - accuracy: 0.0906
Epoch 16/200
 - 4s - loss: 4.8713 - accuracy: 0.1003
Epoch 17/200
 - 3s - loss: 4.7330 - accuracy: 0.1125
Epoch 18/200
 - 3s - loss: 4.5978 - accuracy: 0.1256
Epoch 19/200
 - 4s - loss: 4.4711 - accuracy: 0.1407
Ep

Epoch 155/200
 - 3s - loss: 0.2950 - accuracy: 0.9182
Epoch 156/200
 - 3s - loss: 0.2976 - accuracy: 0.9166
Epoch 157/200
 - 3s - loss: 0.3162 - accuracy: 0.9122
Epoch 158/200
 - 3s - loss: 0.3286 - accuracy: 0.9098
Epoch 159/200
 - 3s - loss: 0.3004 - accuracy: 0.9162
Epoch 160/200
 - 3s - loss: 0.2879 - accuracy: 0.9175
Epoch 161/200
 - 3s - loss: 0.2848 - accuracy: 0.9179
Epoch 162/200
 - 3s - loss: 0.2844 - accuracy: 0.9171
Epoch 163/200
 - 3s - loss: 0.2828 - accuracy: 0.9159
Epoch 164/200
 - 3s - loss: 0.2822 - accuracy: 0.9167
Epoch 165/200
 - 3s - loss: 0.2810 - accuracy: 0.9170
Epoch 166/200
 - 3s - loss: 0.2805 - accuracy: 0.9161
Epoch 167/200
 - 3s - loss: 0.2837 - accuracy: 0.9159
Epoch 168/200
 - 3s - loss: 0.2920 - accuracy: 0.9135
Epoch 169/200
 - 3s - loss: 0.2954 - accuracy: 0.9138
Epoch 170/200
 - 3s - loss: 0.2816 - accuracy: 0.9155
Epoch 171/200
 - 3s - loss: 0.2762 - accuracy: 0.9164
Epoch 172/200
 - 3s - loss: 0.2761 - accuracy: 0.9172
Epoch 173/200
 - 3s - loss: 

## 3. 모델 검증

In [29]:
def sentence_generation(model, t, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word # 처음 들어온 단어도 마지막에 같이 출력하기위해 저장
    sentence = ''
    for _ in range(n): # n번 반복
        encoded = t.texts_to_sequences([current_word])[0] # 현재 단어에 대한 정수 인코딩
        encoded = pad_sequences([encoded], maxlen=23, padding='pre') # 데이터에 대한 패딩
        result = model.predict_classes(encoded, verbose=0)
          # 입력한 X(현재 단어)에 대해서 y를 예측하고 y(예측한 단어)를 result에 저장.
        for word, index in t.word_index.items(): 
            if index == result: # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면
                break # 해당 단어가 예측 단어이므로 break
        current_word = current_word + ' '  + word # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        sentence = sentence + ' ' + word # 예측 단어를 문장에 저장

    sentence = init_word + sentence
    return sentence

In [30]:
print(sentence_generation(model, t, 'i', 10))
#임의의 단어 'i'에 대해서 10개의 단어를 추가 생성

i cant jump ship from facebook yet syria pick its to


In [31]:
print(sentence_generation(model, t, 'how', 10))
#임의의 단어 'how'에 대해서 10개의 단어를 추가 생성

how to win an argument about guns hair may in syria


In [32]:
print(sentence_generation(model, t, 'formor', 10))

formor those rescue on planning the gop for credit card signatures
