In [71]:
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [72]:
# open the file widiow.txt and read it 
with open('widow.txt', 'r') as file:
    text = file.read().replace('\n', '')


In [None]:
# tokenize the text into sentences
sentences = sent_tokenize(text)
sentences

In [74]:
normalizedText = [] 
for sent in sentences: 
    tokens = re.sub("[^a-z0-9]+"," ",sent.lower())
    normalizedText.append(tokens)
result = [ word_tokenize(s) for s in normalizedText ] 

In [75]:
from gensim.models import Word2Vec

In [76]:
model = Word2Vec(sentences=result , vector_size = 180 , window = 4, min_count=2 , workers = 4 , sg = 1)

In [77]:
model.wv.most_similar('widow', topn=10)

[('women', 0.2551659345626831),
 ('burden', 0.24160641431808472),
 ('against', 0.23628844320774078),
 ('spouses', 0.20751942694187164),
 ('the', 0.20169739425182343),
 ('not', 0.1904909461736679),
 ('by', 0.19035615026950836),
 ('another', 0.18514233827590942),
 ('be', 0.18389874696731567),
 ('of', 0.18334296345710754)]

In [78]:
model.wv.save_word2vec_format('widow2')

In [79]:
from gensim.models.keyedvectors import KeyedVectors

In [80]:
loaded_model = KeyedVectors.load_word2vec_format('widow2')

In [81]:
print('모델의 크기(shape) :',loaded_model.vectors.shape)
# 145의 차원을 가진 word2vec 벡터가 180개 있다.

모델의 크기(shape) : (145, 180)


In [159]:
sentences = [
    'The state of having lost ones spouse to death is termed widowhood',
    'a woman who has lost her spouse by death and has not remarried.',
    'widow is burden',
    'spouses means persons wife',
    'spouses means a husband or wife, considered in relation to their partner.',
    'My today has been really tough, but I am seeing it as another chance to focus on healing and moving forward.',
    'I am ending with a few chapters on a book and some journaling to process my thoughts.',
    'Throughout the day I found a few moments to check in with my friends and family, knowing that their love and company are essential for my healing process.',
]
y_train = [1,1,1,1,1,1,1,1,]

In [161]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1 # 패딩을 고려하여 +1
print('단어 집합 :',vocab_size)

단어 집합 : 76


In [170]:

X_encoded = tokenizer.texts_to_sequences(sentences)
print('정수 인코딩 결과 :',X_encoded)
max_len = max(len(l) for l in X_encoded)
print('최대 길이 :',max_len)
X_train = pad_sequences(X_encoded, maxlen= max_len, padding='post')
y_train = np.array(y_train)
print('패딩 결과 :')
print(X_train)
print(y_train)

정수 인코딩 결과 : [[7, 23, 24, 25, 8, 26, 9, 1, 10, 11, 27, 28], [2, 29, 30, 5, 8, 31, 9, 32, 10, 3, 5, 33, 34], [35, 11, 36], [12, 13, 37, 14], [12, 13, 2, 38, 39, 14, 40, 15, 41, 1, 16, 42], [4, 43, 5, 44, 45, 46, 47, 6, 17, 48, 49, 50, 51, 52, 1, 53, 18, 19, 3, 54, 55], [6, 17, 56, 20, 2, 21, 57, 18, 2, 58, 3, 59, 60, 1, 22, 4, 61], [62, 7, 63, 6, 64, 2, 21, 65, 1, 66, 15, 20, 4, 67, 3, 68, 69, 70, 16, 71, 3, 72, 73, 74, 75, 4, 19, 22]]
최대 길이 : 28
패딩 결과 :
[[ 7 23 24 25  8 26  9  1 10 11 27 28  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [ 2 29 30  5  8 31  9 32 10  3  5 33 34  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [35 11 36  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [12 13 37 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [12 13  2 38 39 14 40 15 41  1 16 42  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]
 [ 4 43  5 44 45 46 47  6 17 48 49 50 51 52  1 53 18 19  3 54 55  0  0  0
   0  0  0  0]
 [ 6 17 5

In [171]:
embedding_matrix = np.zeros((vocab_size, 180))
def get_vector(word):
    if word in loaded_model:
        return loaded_model[word]
    else:
        return None

In [172]:
for word, index in tokenizer.word_index.items():
    # 단어와 맵핑되는 사전 훈련된 임베딩 벡터값
    vector_value = get_vector(word)
    if vector_value is not None:
        embedding_matrix[index] = vector_value

In [175]:
print(loaded_model['woman'])

[-2.5543014e-03  5.0518531e-03  2.6298468e-03 -2.2227594e-03
 -9.3628105e-04 -1.4578261e-03  4.1421140e-03  3.0400320e-03
 -4.8831264e-03  1.5014106e-03  3.4770013e-03 -1.1652287e-03
 -1.6344019e-03  2.2051195e-03 -2.9978456e-03 -1.2554310e-03
  3.2921962e-03 -3.5982742e-03 -6.8841311e-03  8.2824141e-04
 -3.1833842e-03 -3.5017198e-03  5.9437947e-03  3.0079756e-03
  5.2983882e-03 -7.1264724e-03 -1.2695376e-03 -4.5329980e-03
  1.7685987e-04 -5.2175559e-03  4.4770483e-03  8.2440639e-04
 -6.4092450e-04 -5.5728399e-04  6.2878910e-03 -2.2451553e-05
 -2.0939508e-03 -4.6842378e-03  2.0122917e-03  6.1489139e-03
  2.4584220e-03 -2.4827456e-03  1.1989875e-03 -1.2798873e-03
 -5.5495868e-03 -1.8700454e-03  3.4627496e-04  2.9323173e-03
 -2.9523307e-03  1.8967528e-04 -2.8107308e-03 -8.6703961e-04
 -9.4280578e-04  3.4729799e-04 -6.0255639e-04 -3.5824403e-03
  1.5913261e-03 -5.8365376e-03  3.8421177e-03 -6.0166125e-03
 -6.1211004e-03 -1.9964369e-03  2.8728852e-03  1.0633443e-03
 -4.6137138e-03 -3.78571

In [182]:
print('단어 women의 맵핑된 정수 :', tokenizer.word_index['woman'])

단어 women의 맵핑된 정수 : 29


In [183]:
print('단어 woman의 임베딩 벡터값: ', embedding_matrix[29])

단어 woman의 임베딩 벡터값:  [-2.55430141e-03  5.05185314e-03  2.62984680e-03 -2.22275942e-03
 -9.36281052e-04 -1.45782612e-03  4.14211396e-03  3.04003200e-03
 -4.88312636e-03  1.50141062e-03  3.47700133e-03 -1.16522866e-03
 -1.63440185e-03  2.20511947e-03 -2.99784564e-03 -1.25543098e-03
  3.29219620e-03 -3.59827420e-03 -6.88413111e-03  8.28241406e-04
 -3.18338419e-03 -3.50171980e-03  5.94379473e-03  3.00797564e-03
  5.29838819e-03 -7.12647242e-03 -1.26953761e-03 -4.53299796e-03
  1.76859874e-04 -5.21755591e-03  4.47704829e-03  8.24406394e-04
 -6.40924496e-04 -5.57283987e-04  6.28789095e-03 -2.24515534e-05
 -2.09395075e-03 -4.68423776e-03  2.01229169e-03  6.14891388e-03
  2.45842198e-03 -2.48274556e-03  1.19898748e-03 -1.27988728e-03
 -5.54958684e-03 -1.87004544e-03  3.46274959e-04  2.93231732e-03
 -2.95233075e-03  1.89675280e-04 -2.81073083e-03 -8.67039606e-04
 -9.42805782e-04  3.47297988e-04 -6.02556393e-04 -3.58244032e-03
  1.59132609e-03 -5.83653757e-03  3.84211773e-03 -6.01661252e-03
 -6.1

In [184]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

model = Sequential()
e = Embedding(vocab_size, 180, weights=[embedding_matrix], input_length=max_len)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.6930 - acc: 0.3750 - 323ms/epoch - 323ms/step
Epoch 2/100
1/1 - 0s - loss: 0.6694 - acc: 1.0000 - 14ms/epoch - 14ms/step
Epoch 3/100
1/1 - 0s - loss: 0.6451 - acc: 1.0000 - 16ms/epoch - 16ms/step
Epoch 4/100
1/1 - 0s - loss: 0.6191 - acc: 1.0000 - 17ms/epoch - 17ms/step
Epoch 5/100
1/1 - 0s - loss: 0.5913 - acc: 1.0000 - 22ms/epoch - 22ms/step
Epoch 6/100
1/1 - 0s - loss: 0.5617 - acc: 1.0000 - 15ms/epoch - 15ms/step
Epoch 7/100
1/1 - 0s - loss: 0.5303 - acc: 1.0000 - 12ms/epoch - 12ms/step
Epoch 8/100
1/1 - 0s - loss: 0.4973 - acc: 1.0000 - 12ms/epoch - 12ms/step
Epoch 9/100
1/1 - 0s - loss: 0.4631 - acc: 1.0000 - 12ms/epoch - 12ms/step
Epoch 10/100
1/1 - 0s - loss: 0.4279 - acc: 1.0000 - 7ms/epoch - 7ms/step
Epoch 11/100
1/1 - 0s - loss: 0.3923 - acc: 1.0000 - 9ms/epoch - 9ms/step
Epoch 12/100
1/1 - 0s - loss: 0.3566 - acc: 1.0000 - 12ms/epoch - 12ms/step
Epoch 13/100
1/1 - 0s - loss: 0.3214 - acc: 1.0000 - 10ms/epoch - 10ms/step
Epoch 14/100
1/1 - 0s -

<keras.callbacks.History at 0x7fe9907cfeb0>