In [70]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
from string import punctuation

In [2]:
texts=['자연어 처리 알고리즘', '자연어 처리 방법',
       '자연어 NLP 알고리즘 알고리즘',
       '자연어 처리 전문가']

In [3]:
tok = Tokenizer()
tok.fit_on_texts(texts)

In [4]:
tok.index_word

{1: '자연어', 2: '처리', 3: '알고리즘', 4: '방법', 5: 'nlp', 6: '전문가'}

In [5]:
tok.texts_to_matrix(texts) # default : mode='binary'

array([[0., 1., 1., 1., 0., 0., 0.],
       [0., 1., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 1.]])

In [6]:
tok.texts_to_matrix(texts, mode='tfidf')

array([[0.        , 0.58778666, 0.69314718, 0.84729786, 0.        ,
        0.        , 0.        ],
       [0.        , 0.58778666, 0.69314718, 0.        , 1.09861229,
        0.        , 0.        ],
       [0.        , 0.58778666, 0.        , 1.43459998, 0.        ,
        1.09861229, 0.        ],
       [0.        , 0.58778666, 0.69314718, 0.        , 0.        ,
        0.        , 1.09861229]])

### 다음 단어 예측

In [7]:
df = pd.read_csv('../Data/NYT_2018.csv')
df

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1319,5ae82c93068401528a2ab969,1004,By CLAIRE CAIN MILLER,article,This Common Question Reinforces the Gender Pay...,"['Discrimination', 'Wages and Salaries', 'Labo...",68,Upshot,3,2018-05-01 09:00:01,Unknown,Several states and cities have ordered employe...,The New York Times,News,https://www.nytimes.com/2018/05/01/upshot/how-...
1320,5ae82c95068401528a2ab96b,1043,By TRACY J. GATES,article,"Anna, Llama and Me","['Friendship', 'Dewdney, Anna', 'Writing and W...",65,Well,0,2018-05-01 09:00:02,Family,"The beginning, middle and end of a picture boo...",The New York Times,News,https://www.nytimes.com/2018/05/01/well/family...
1321,5ae82c9d068401528a2ab96d,659,Interview by AUDIE CORNISH,article,Gen. Michael Hayden Has One Regret: Russia,"['Classified Information and State Secrets', '...",66,Magazine,70,2018-05-01 09:00:06,Unknown,"The former N.S.A. and C.I.A. chief on Trump, S...",The New York Times,News,https://www.nytimes.com/2018/05/01/magazine/ge...
1322,5ae82c9f068401528a2ab96f,1155,By JASON ROBERT BROWN,article,There Is Nothin’ Like a Tune,"['Books and Literature', 'Purdum, Todd S', 'Th...",68,BookReview,17,2018-05-01 09:00:07,Book Review,"In “Something Wonderful,” Todd S. Purdum analy...",The New York Times,Review,https://www.nytimes.com/2018/05/01/books/revie...


In [8]:
headline = []
headline.extend(list(df.headline.values))
headline

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown',
 'Unknown',
 'Unknown',
 'Unknown',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?',
 'Commuter Reprogramming',
 'Unknown',
 'Unknown',
 'Ford Changed Leaders, Looking for a Lift. It’s Still Looking.',
 'Romney Failed to Win at Utah Convention, But Few Believe He’s Doomed',
 'Chain Reaction',
 'He Forced the Vatican to Investigate Sex Abuse. Now He’s Meeting With Pope Francis.',
 'In Berlin, artists find a home',
 'Unknown',
 'The Right Stuff',
 'Jimmy Carter Knows What North Korea Wants',
 'The Truth Is Out There',
 'New Jersey Ruling Could Reignite Battle Over Church-State Separation',
 'Procrastinating',
 'Word + Quiz: dilatory',
 'My Life-Threatening Bout With E. Coli Food Poisoning',
 'Choosing Brexit, a Town Yearned for Its Seafa

In [9]:
sum(df.headline=='Unknown')

110

In [10]:
headline[0]

'Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell'

In [11]:
def pre_func(title):
    res = ''.join(w.lower() for w in title if w not in punctuation)
    return res

In [12]:
pre_headline = [pre_func(x) for x in headline if x != 'Unknown']

In [13]:
pre_headline

['former nfl cheerleaders’ settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression',
 'commuter reprogramming',
 'ford changed leaders looking for a lift it’s still looking',
 'romney failed to win at utah convention but few believe he’s doomed',
 'chain reaction',
 'he forced the vatican to investigate sex abuse now he’s meeting with pope francis',
 'in berlin artists find a home',
 'the right stuff',
 'jimmy carter knows what north korea wants',
 'the truth is out there',
 'new jersey ruling could reignite battle over churchstate separation',
 'procrastinating',
 'word  quiz dilatory',
 'my lifethreatening bout with e coli food poisoning',
 'choosing brexit a town yearned for its seafaring past and muddied its future',
 'a quote disproved',
 'hot stuff turns cold',
 'at the top the pay gap may be gone',
 '

In [14]:
tok = Tokenizer()

tok.fit_on_texts(pre_headline)

In [17]:
tok.index_word

{1: 'the',
 2: 'a',
 3: 'to',
 4: 'of',
 5: 'in',
 6: 'for',
 7: 'and',
 8: 'on',
 9: 'is',
 10: 'with',
 11: 'trump',
 12: 'as',
 13: 'at',
 14: 'new',
 15: 'from',
 16: 'how',
 17: 'it',
 18: 'an',
 19: 'be',
 20: 'that',
 21: 'season',
 22: 'us',
 23: 'you',
 24: 'what',
 25: 'episode',
 26: 'can',
 27: 'your',
 28: 'he',
 29: 'not',
 30: 'his',
 31: 'are',
 32: 'teaching',
 33: 'out',
 34: 'was',
 35: 'no',
 36: 'by',
 37: 'trump’s',
 38: 'has',
 39: 'war',
 40: 'over',
 41: 'may',
 42: 'into',
 43: 'more',
 44: 'now',
 45: 'why',
 46: 'who',
 47: 'about',
 48: 'recap',
 49: 'activities',
 50: '1',
 51: 'we',
 52: 'do',
 53: 'women',
 54: 'when',
 55: 'syria',
 56: 'trade',
 57: 'it’s',
 58: '2',
 59: 'or',
 60: 'will',
 61: 'i',
 62: 'have',
 63: 'just',
 64: 'but',
 65: 'home',
 66: 'up',
 67: 'this',
 68: 'one',
 69: 'off',
 70: 'house',
 71: 'president',
 72: 'gop',
 73: 'our',
 74: 'long',
 75: 'facebook',
 76: 'case',
 77: 'life',
 78: 'end',
 79: 'some',
 80: 'power',
 81: '

In [16]:
len(tok.index_word)

3619

In [18]:
vocab_size = len(tok.index_word) + 1

In [19]:
sequences = []
for s in pre_headline:
    enc = tok.texts_to_sequences([s])[0]
    for i in range(1, len(enc)):
        seq = enc[:i+1]
        sequences.append(seq)

In [20]:
sequences[:15]

[[95, 263],
 [95, 263, 1100],
 [95, 263, 1100, 1101],
 [95, 263, 1100, 1101, 572],
 [95, 263, 1100, 1101, 572, 50],
 [95, 263, 1100, 1101, 572, 50, 7],
 [95, 263, 1100, 1101, 572, 50, 7, 2],
 [95, 263, 1100, 1101, 572, 50, 7, 2, 365],
 [95, 263, 1100, 1101, 572, 50, 7, 2, 365, 10],
 [95, 263, 1100, 1101, 572, 50, 7, 2, 365, 10, 1102],
 [96, 3],
 [96, 3, 1103],
 [96, 3, 1103, 2],
 [96, 3, 1103, 2, 14],
 [96, 3, 1103, 2, 14, 573]]

In [24]:
idx2word = {}
for k, v in tok.word_index.items():
    idx2word[v] = k

맨 오른쪽 값이 target, 나머지를 train으로 볼 수 있다.

다음 단어를 예측하는 것이므로 n:1 관계이다 따라서 n:1 RNN구조를 설계해야 한다.

맨 마지막 출력의 차원은 3620차원으로 나와야 한다 (vocab의 길이가 3620이므로)

In [102]:
max_len = max(len(i) for i in sequences)
max_len

24

In [27]:
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [30]:
sequences[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   95,
        263, 1100])

In [51]:
sequences=np.array(sequences)
x=sequences[:,:-1]
y=sequences[:,-1]

In [54]:
sequences.shape

(7809, 24)

In [52]:
x.shape

(7809, 23)

In [53]:
y.shape

(7809,)

In [56]:
y

array([ 263, 1100, 1101, ...,  110,    2, 1014])

In [71]:
y = to_categorical(y, vocab_size) # 원핫인코딩

In [72]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [73]:
y.shape

(7809, 3620)

In [58]:
vocab_size

3620

24개(단어 최대개수)의 10차원짜리 임베딩된 벡터가 LSTM 셀에 들어간다
- 그대로 3620차원의 원핫인코딩된 벡터를 넣으면 너무 연산량이 많기 때문이다.   

LSTM의 출력의 차원도 결정해줘야 한다 (밑의 코드에서는 128차원)
- 차원의 차수는 특성의 개수라고 생각해도 된다   

마지막 출력인 128차원과 vocab_size가 Dense(fully connected)로 결합된다   
그 후 vocab_size의 차원에 확률이 실수로 나온다 (0.01 0.0 0.3 .... 0.6 0.1 ...)   
이것을 정답 벡터와 비교해서 (0 0 0 ... 0 1 ...) 오류를 줄이도록(최적화) 역전파로 학습시킨다

In [64]:
model = Sequential()
model.add(Embedding(vocab_size, 10)) # 3620 -> 10차원으로 임베딩
model.add(LSTM(128)) # LSTM 계층을 하나 더 만드려면 return_sequences=True로 바꿔줘야함
model.add(Dense(vocab_size, activation='softmax'))

In [65]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 10)          36200     
                                                                 
 lstm_2 (LSTM)               (None, 128)               71168     
                                                                 
 dense_1 (Dense)             (None, 3620)              466980    
                                                                 
Total params: 574348 (2.19 MB)
Trainable params: 574348 (2.19 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [75]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [76]:
model.fit(x, y, epochs=200, verbose=2)

Epoch 1/200
245/245 - 10s - loss: 7.6943 - accuracy: 0.0251 - 10s/epoch - 39ms/step
Epoch 2/200
245/245 - 7s - loss: 7.1561 - accuracy: 0.0296 - 7s/epoch - 31ms/step
Epoch 3/200
245/245 - 8s - loss: 7.0215 - accuracy: 0.0311 - 8s/epoch - 32ms/step
Epoch 4/200
245/245 - 8s - loss: 6.8986 - accuracy: 0.0394 - 8s/epoch - 32ms/step
Epoch 5/200
245/245 - 7s - loss: 6.7457 - accuracy: 0.0442 - 7s/epoch - 30ms/step
Epoch 6/200
245/245 - 7s - loss: 6.5745 - accuracy: 0.0483 - 7s/epoch - 30ms/step
Epoch 7/200
245/245 - 7s - loss: 6.3856 - accuracy: 0.0540 - 7s/epoch - 30ms/step
Epoch 8/200
245/245 - 8s - loss: 6.1843 - accuracy: 0.0553 - 8s/epoch - 31ms/step
Epoch 9/200
245/245 - 8s - loss: 5.9888 - accuracy: 0.0611 - 8s/epoch - 32ms/step
Epoch 10/200
245/245 - 8s - loss: 5.8062 - accuracy: 0.0675 - 8s/epoch - 32ms/step
Epoch 11/200
245/245 - 8s - loss: 5.6310 - accuracy: 0.0731 - 8s/epoch - 32ms/step
Epoch 12/200
245/245 - 9s - loss: 5.4720 - accuracy: 0.0773 - 9s/epoch - 36ms/step
Epoch 13/20

<keras.src.callbacks.History at 0x28787c117e0>

In [96]:
def gen_sent(model, tok, c_word, n):
    pred_sent=''
    for _ in range(n):
        enc = tok.texts_to_sequences([c_word])[0]
        enc = pad_sequences([enc], maxlen=max_len, padding='pre')
        res = model.predict(enc, verbose=False) # 3620차원
        res = res.argmax()
        for w, i in tok.word_index.items():
            if i == res:
                break
        c_word = c_word + " " + w
    pred_sent = c_word
    return pred_sent``

In [97]:
gen_sent(model, tok, 'There', 10)

'There is a middle ground on guns have you a joke'