# LSTM을 이용한 텍스트 생성

In [1]:
import pandas as pd
from string import punctuation

from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
df = pd.read_csv('../dataset/ArticlesMay2017.csv')
df.head()

Unnamed: 0,abstract,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,,5906b3197c459f24986dd003,2412,By IAN JOHNSON,article,My Beijing: The Sacred City,"['Travel and Vacations', 'Beijing (China)', 'R...",3,Travel,1,2017-05-01 04:01:21,Unknown,"This metropolis was once a total work of art, ...",The New York Times,News,https://www.nytimes.com/2017/05/01/travel/beij...
1,,5906b3297c459f24986dd004,2318,By EMMA G. FITZSIMMONS,article,"6 Million Riders a Day, 1930s Technology","['Subways', 'Delays (Transportation)', 'Transi...",68,Metro,1,2017-05-01 04:01:33,Unknown,New York’s subway is struggling with old infra...,The New York Times,News,https://www.nytimes.com/2017/05/01/nyregion/ne...
2,,5906ceec7c459f24986dd021,1795,By MARC TRACY,article,Seeking a Cross-Border Conference,"['Cetys University', 'College Athletics', 'Nat...",3,Sports,1,2017-05-01 06:00:05,Unknown,Cetys University is making an ambitious bid to...,The New York Times,News,https://www.nytimes.com/2017/05/01/sports/mexi...
3,,5906cfa77c459f24986dd022,213,By SHANNON DOYNE,article,"Questions for: ‘Despite the “Yuck Factor,” Lee...",[],3,Learning,0,2017-05-01 06:03:03,Unknown,How are leeches used to treat various medical ...,The New York Times,News,https://www.nytimes.com/2017/05/01/learning/qu...
4,,5906e1c07c459f24986dd039,1342,By JASON STANLEY,article,Who Is a ‘Criminal’?,"['Illegal Immigration', 'Traffic and Parking V...",3,OpEd,0,2017-05-01 07:20:26,Unknown,Justice Roberts was right. The Trump administr...,The New York Times,Op-Ed,https://www.nytimes.com/2017/05/01/opinion/who...


In [3]:
print('열의 개수: ', len(df.columns))
print(df.columns)

열의 개수:  16
Index(['abstract', 'articleID', 'articleWordCount', 'byline', 'documentType',
       'headline', 'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


In [4]:
del df['abstract']

In [5]:
# headline 열의 데이터만 사용, NULL 검사
df['headline'].isnull().values.any()

False

In [6]:
# headline 열의 데이터로 리스트 생성
# headline.values를 빼서 title에 넣어줌
headline = [title for title in df.headline.values]
headline[:5]

['My Beijing: The Sacred City',
 '6 Million Riders a Day, 1930s Technology',
 'Seeking a Cross-Border Conference',
 'Questions for: ‘Despite the “Yuck Factor,” Leeches Are Big in Russian Medicine’',
 'Who Is a ‘Criminal’?']

In [7]:
len(headline)

996

In [8]:
# 노이즈 데이터 ('unknown') 제거
headline = [title for title in headline if title != 'Unknown']
len(headline)

935

In [9]:
# 구두점 제거와 소문자화를 위한 함수
def repreprocessing(s):
    s = s.encode("utf8").decode("ascii", 'ignore')
    return ''.join(c for c in s if c not in punctuation).lower()

In [10]:
text = [repreprocessing(x) for x in headline]
text

['my beijing the sacred city',
 '6 million riders a day 1930s technology',
 'seeking a crossborder conference',
 'questions for despite the yuck factor leeches are big in russian medicine',
 'who is a criminal',
 'an antidote to europes populism',
 'the cost of a speech',
 'degradation of the language',
 'on the power of being awful',
 'trump garbles pitch on a revised health bill',
 'whats going on in this picture  may 1 2017',
 'when patients hit a medical wall',
 'for pregnant women getting serious about whooping cough',
 'new york city transit reporter in wonderland riding the london tube',
 'how to cut an avocado without cutting yourself',
 'in fictional suicide health experts say they see a real cause for alarm',
 'claims of liberal media bias hit espn too',
 'is the dream in australia crumbling',
 'police in texas change account in officers fatal shooting of 15yearold',
 'most adults favor sex ed most students dont get it',
 'australia feels its ties to us put it in a bind',
 'a

In [11]:
# 단어 집합(vocabulary)을 만들고 크기를 확인
t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1
print('단어 집합의 크기 : %d' % vocab_size)

단어 집합의 크기 : 2653


In [12]:
print(t.word_index)



In [13]:
sequences = []
for line in text: # 1,214개의 샘플에 대해서 샘플을 1개씩 가져온다.
    encoded = t.texts_to_sequences([line])[0] # 각 샘플에 대한 정수 인코딩
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)

sequences[:11] # 11개의 샘플 출력

[[48, 799],
 [48, 799, 1],
 [48, 799, 1, 800],
 [48, 799, 1, 800, 57],
 [114, 406],
 [114, 406, 407],
 [114, 406, 407, 2],
 [114, 406, 407, 2, 179],
 [114, 406, 407, 2, 179, 801],
 [114, 406, 407, 2, 179, 801, 802],
 [803, 2]]

## i로 시작하는 실제 문장

In [14]:
check_i = []
for i in range(len(sequences)):
    if sequences[i][0] == 138:
        check_i.append(sequences[i])
check_i

[[138, 1710],
 [138, 1710, 1711],
 [138, 1710, 1711, 2],
 [138, 1710, 1711, 2, 240],
 [138, 1710, 1711, 2, 240, 29],
 [138, 1710, 1711, 2, 240, 29, 138],
 [138, 1710, 1711, 2, 240, 29, 138, 1712],
 [138, 1710, 1711, 2, 240, 29, 138, 1712, 1],
 [138, 1710, 1711, 2, 240, 29, 138, 1712, 1, 226]]

In [35]:
a=[]
real_se=[]
for i in range(len(check_i)):
    a = check_i[i]
    real_se.append(t.sequences_to_texts([a]))
real_se

[['i accidentally'],
 ['i accidentally killed'],
 ['i accidentally killed a'],
 ['i accidentally killed a child'],
 ['i accidentally killed a child may'],
 ['i accidentally killed a child may i'],
 ['i accidentally killed a child may i contact'],
 ['i accidentally killed a child may i contact the'],
 ['i accidentally killed a child may i contact the family']]

In [16]:
index_to_word ={}
for key, value in t.word_index.items():
    index_to_word[value] = key

print('빈도수 상위 1번 단어 : ', index_to_word[1])
print('빈도수 상위 582번 단어 : ', index_to_word[582])

빈도수 상위 1번 단어 :  the
빈도수 상위 582번 단어 :  taiwan


In [17]:
max_len = max(len(s) for s in sequences)
print('샘플의 최대 길이 : ', max_len)

샘플의 최대 길이 :  16


In [18]:
# 전체 샘플의 길이를 24(가장 긴 샘플의 길이)로 패딩
# 'pre' 옵션을 주면 앞을 0으로 패딩
sequences = pad_sequences(sequences, maxlen=max_len, padding = 'pre')
print(sequences)

[[   0    0    0 ...    0   48  799]
 [   0    0    0 ...   48  799    1]
 [   0    0    0 ...  799    1  800]
 ...
 [   0    0    0 ...  248    7 2652]
 [   0    0    0 ...    7 2652  404]
 [   0    0    0 ... 2652  404  405]]


In [19]:
# X=앞에 0 패딩을 해준것 
X = sequences[:,:-1]
y = sequences[:,-1]

In [20]:
# 레이블 데이터 y에 대해서 원-핫 인코딩을 수행
y = to_categorical(y, num_classes=vocab_size)

In [21]:
X

array([[   0,    0,    0, ...,    0,    0,   48],
       [   0,    0,    0, ...,    0,   48,  799],
       [   0,    0,    0, ...,   48,  799,    1],
       ...,
       [   0,    0,    0, ...,  758,  248,    7],
       [   0,    0,    0, ...,  248,    7, 2652],
       [   0,    0,    0, ...,    7, 2652,  404]])

In [22]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [23]:
X.shape, y.shape

((5501, 15), (5501, 2653))

## 2. 모델 설계 및 학습

In [24]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM

In [25]:
# 임베딩 벡터는 10차원, 은닉 상태 크기는 128
embedding = Embedding(vocab_size, 20, input_length=max_len-1, name="Embedding_Layer")
lstm = LSTM(62, name='LSTM_Layer')
output = Dense(vocab_size , activation='softmax', name= 'Output_Layer')

In [26]:
model = Sequential()
model.add(embedding)
model.add(lstm)
model.add(output)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding_Layer (Embedding)  (None, 15, 20)            53060     
_________________________________________________________________
LSTM_Layer (LSTM)            (None, 62)                20584     
_________________________________________________________________
Output_Layer (Dense)         (None, 2653)              167139    
Total params: 240,783
Trainable params: 240,783
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.compile(loss='categorical_crossentropy',
             optimizer = 'adam', metrics = ['accuracy'])

In [28]:
history = model.fit(X, y, epochs=200, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/200
 - 1s - loss: 7.5043 - accuracy: 0.0334
Epoch 2/200
 - 1s - loss: 6.9755 - accuracy: 0.0369
Epoch 3/200
 - 1s - loss: 6.8443 - accuracy: 0.0362
Epoch 4/200
 - 1s - loss: 6.7480 - accuracy: 0.0394
Epoch 5/200
 - 1s - loss: 6.6647 - accuracy: 0.0434
Epoch 6/200
 - 1s - loss: 6.5855 - accuracy: 0.0513
Epoch 7/200
 - 1s - loss: 6.5112 - accuracy: 0.0487
Epoch 8/200
 - 1s - loss: 6.4444 - accuracy: 0.0513
Epoch 9/200
 - 1s - loss: 6.3838 - accuracy: 0.0542
Epoch 10/200
 - 1s - loss: 6.3269 - accuracy: 0.0553
Epoch 11/200
 - 1s - loss: 6.2720 - accuracy: 0.0551
Epoch 12/200
 - 1s - loss: 6.2182 - accuracy: 0.0580
Epoch 13/200
 - 1s - loss: 6.1625 - accuracy: 0.0573
Epoch 14/200
 - 1s - loss: 6.1081 - accuracy: 0.0613
Epoch 15/200
 - 1s - loss: 6.0557 - accuracy: 0.0618
Epoch 16/200
 - 1s - loss: 6.0016 - accuracy: 0.0629
Epoch 17/200
 - 1s - loss: 5.9474 - accuracy: 0.0631
Epoch 18/200
 - 1s - loss: 5.8927 - accuracy: 0.0653
Epoch 19/200
 - 1s - loss: 5.8375 - accuracy: 0.0638
Ep

Epoch 155/200
 - 1s - loss: 0.7880 - accuracy: 0.8413
Epoch 156/200
 - 1s - loss: 0.7791 - accuracy: 0.8388
Epoch 157/200
 - 1s - loss: 0.7699 - accuracy: 0.8428
Epoch 158/200
 - 1s - loss: 0.7618 - accuracy: 0.8428
Epoch 159/200
 - 1s - loss: 0.7531 - accuracy: 0.8462
Epoch 160/200
 - 1s - loss: 0.7439 - accuracy: 0.8477
Epoch 161/200
 - 1s - loss: 0.7347 - accuracy: 0.8460
Epoch 162/200
 - 1s - loss: 0.7270 - accuracy: 0.8495
Epoch 163/200
 - 1s - loss: 0.7188 - accuracy: 0.8500
Epoch 164/200
 - 1s - loss: 0.7107 - accuracy: 0.8511
Epoch 165/200
 - 1s - loss: 0.7020 - accuracy: 0.8522
Epoch 166/200
 - 1s - loss: 0.6952 - accuracy: 0.8524
Epoch 167/200
 - 1s - loss: 0.6880 - accuracy: 0.8549
Epoch 168/200
 - 1s - loss: 0.6797 - accuracy: 0.8551
Epoch 169/200
 - 1s - loss: 0.6729 - accuracy: 0.8562
Epoch 170/200
 - 1s - loss: 0.6645 - accuracy: 0.8582
Epoch 171/200
 - 1s - loss: 0.6591 - accuracy: 0.8593
Epoch 172/200
 - 1s - loss: 0.6523 - accuracy: 0.8589
Epoch 173/200
 - 1s - loss: 

## 3. 모델 검증

In [29]:
def sentence_generation(model, t, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word # 처음 들어온 단어도 마지막에 같이 출력하기위해 저장
    sentence = ''
    for _ in range(n): # n번 반복
        encoded = t.texts_to_sequences([current_word])[0] # 현재 단어에 대한 정수 인코딩
        encoded = pad_sequences([encoded], maxlen=15, padding='pre') # 데이터에 대한 패딩
        result = model.predict_classes(encoded, verbose=0)
          # 입력한 X(현재 단어)에 대해서 y를 예측하고 y(예측한 단어)를 result에 저장.
        for word, index in t.word_index.items(): 
            if index == result: # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면
                break # 해당 단어가 예측 단어이므로 break
        current_word = current_word + ' '  + word # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        sentence = sentence + ' ' + word # 예측 단어를 문장에 저장

    sentence = init_word + sentence
    return sentence

In [50]:
print(sentence_generation(model, t, 'i', 10))
pre_se=[]
pre_se.append(sentence_generation(model, t, 'i', 10))
#임의의 단어 'i'에 대해서 10개의 단어를 추가 생성

i accidentally killed a child may i contact the family idea


In [55]:
df = pd.DataFrame(data = [real_se[8],pre_se], index=['real', 'predict'])

In [56]:
df

Unnamed: 0,0
real,i accidentally killed a child may i contact th...
predict,i accidentally killed a child may i contact th...


In [59]:
real_se, pre_se

([['i accidentally'],
  ['i accidentally killed'],
  ['i accidentally killed a'],
  ['i accidentally killed a child'],
  ['i accidentally killed a child may'],
  ['i accidentally killed a child may i'],
  ['i accidentally killed a child may i contact'],
  ['i accidentally killed a child may i contact the'],
  ['i accidentally killed a child may i contact the family']],
 ['i accidentally killed a child may i contact the family idea'])