In [1]:
import pandas as pd
import numpy as np
# Load the review CSV and, in the same step, discard every row that has a
# missing value in any column.
df = pd.read_csv('reviews_Cell_Phones_and_Accessories.csv').dropna()

### 토큰화

In [2]:
import tensorflow as tf

In [3]:
# Create a Keras text tokenizer configured with num_words=3000 and '<unk>'
# as the token used for out-of-vocabulary words.
tk = tf.keras.preprocessing.text.Tokenizer(num_words=3000, oov_token='<unk>')

In [4]:
# Fit the tokenizer on the text of the 'review' column.
tk.fit_on_texts(df['review'])

In [7]:
import joblib
# Persist the fitted tokenizer to 'tokenizer.pkl' so it can be reloaded later.
joblib.dump(tk, 'tokenizer.pkl')

['tokenizer.pkl']

In [8]:
# Reload the tokenizer from 'tokenizer.pkl', rebinding tk.
# NOTE(review): joblib files are pickle-based; only load files you trust.
tk = joblib.load('tokenizer.pkl')

### 언어 모형에 맞게 데이터 정리

In [9]:
# Convert each review text into a sequence of integer token ids.
seqs = tk.texts_to_sequences(df['review'])

In [13]:
# Take the first tokenized review (handy for inspection). This value is not
# kept for long: the loop in the following cell rebinds `seq`.
seq = seqs[0]

In [15]:
# Build (context, next-word) training pairs with a sliding window: every run
# of CONTEXT_LEN consecutive token ids is paired with the id that follows it.
# Sequences with CONTEXT_LEN tokens or fewer produce no pairs.
CONTEXT_LEN = 4  # number of preceding tokens used to predict the next one
data = [
    (tokens[start:start + CONTEXT_LEN], tokens[start + CONTEXT_LEN])
    for tokens in seqs
    for start in range(len(tokens) - CONTEXT_LEN)
]

In [17]:
import random

In [18]:
# Shuffle the training pairs in place. A dedicated, seeded generator is used
# so that re-running the notebook from a fresh kernel produces the same order
# (and therefore the same cached arrays and the same xs[0] / ys[0] examples),
# without altering the global `random` state.
random.Random(42).shuffle(data)

In [20]:
# Split the (context, target) pairs into two parallel lists in a single pass,
# then turn them into NumPy arrays: xs holds the contexts, ys the next-word ids.
contexts, targets = [], []
for context, target in data:
    contexts.append(context)
    targets.append(target)
xs = np.array(contexts)
ys = np.array(targets)

In [23]:
# Persist the (xs, ys) training arrays as one tuple in 'lm-data.pkl'.
joblib.dump((xs,ys), 'lm-data.pkl')

['lm-data.pkl']

In [24]:
# Reload the training arrays from 'lm-data.pkl', rebinding xs and ys.
# NOTE(review): joblib files are pickle-based; only load files you trust.
xs, ys = joblib.load('lm-data.pkl')

### 언어 모형

In [25]:
# Size used for both the embedding table and the output layer:
# the tokenizer's num_words (3000) plus one, i.e. 3001.
# NOTE(review): the +1 presumably leaves headroom for the reserved index 0 --
# confirm against the Tokenizer's index range.
NUM_WORD = tk.num_words + 1

In [26]:
# Next-word language model: embed the context token ids, average the
# embeddings over the sequence axis, pass through a small hidden layer and
# emit one score per word id.
# The embedding layer is kept under its own name because its weights are
# read back and exported after training.
emb1 = tf.keras.layers.Embedding(output_dim=8, input_dim=NUM_WORD)

lm = tf.keras.Sequential()
lm.add(emb1)
lm.add(tf.keras.layers.GlobalAveragePooling1D())
lm.add(tf.keras.layers.Dense(8, activation='relu'))
lm.add(tf.keras.layers.Dense(NUM_WORD))  # no activation: outputs are raw logits

In [27]:
# Print the layer-by-layer summary (output shapes and parameter counts).
lm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 8)           24008     
_________________________________________________________________
global_average_pooling1d (Gl (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 8)                 72        
_________________________________________________________________
dense_1 (Dense)              (None, 3001)              27009     
Total params: 51,089
Trainable params: 51,089
Non-trainable params: 0
_________________________________________________________________


In [28]:
# The targets are integer word ids and the model's last layer emits raw
# logits, hence sparse categorical cross-entropy with from_logits=True.
next_word_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
lm.compile(optimizer='adam', loss=next_word_loss, metrics=['accuracy'])

In [29]:
# Train the language model on all (context, next-word) pairs for one epoch.
lm.fit(xs, ys, epochs=1)

Train on 17354034 samples


<tensorflow.python.keras.callbacks.History at 0x14de7dbcbc8>

In [30]:
# Save the trained model under the path 'lm.krs' (the log below shows it is
# written as a directory containing an 'assets' folder).
lm.save('lm.krs')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: lm.krs\assets


### embedding 확인

In [31]:
# Read the embedding layer's weight variable as a NumPy array.
e = emb1.embeddings.numpy()

In [33]:
# Alternative access path: first entry of the layer's weight list.
w = emb1.get_weights()[0]

In [34]:
# Sanity check: both access paths must yield the same matrix.
np.array_equal(e, w)

True

In [35]:
# Export the embedding matrix to 'word-emb.npz' under the key 'emb'.
np.savez('word-emb.npz', emb=e)

In [36]:
# Reload the saved model from 'lm.krs', rebinding lm to the loaded copy.
lm = tf.keras.models.load_model('lm.krs')

### 다음에 나올 단어의 확률 예측

In [37]:
# Pick the first training example: its context ids and its next-word id.
x, y = xs[0], ys[0]

In [40]:
# Map the context's token ids back to their words for display.
[tk.index_word[token_id] for token_id in x]

['install', 'and', 'use', 'all']

- 모형에 넣기.

In [42]:
# Score the first example only; the slice xs[0:1] keeps the leading batch axis.
# NOTE(review): the ids are cast to float32 -- presumably required by the
# reloaded model's input signature; confirm.
logit = lm.predict(xs[0:1].astype('float32'))

In [43]:
# The model outputs raw logits (its last Dense layer has no activation and the
# loss uses from_logits=True), so apply softmax to obtain probabilities.
p = tf.nn.softmax(logit).numpy()

In [44]:
# Probability the model assigns to word id 38 for the first example.
# NOTE(review): 38 is hard-coded -- presumably the true next word ys[0]; if so,
# p[0, y] would express the intent and survive a re-shuffle. Confirm.
p[0, 38]

0.0017933127

- 가장 높은 확률의 단어

In [45]:
# Id of the most probable next word for the first (only) example.
# The row is selected before argmax: calling argmax() on the whole 2-D array
# returns a position in the flattened array, which coincides with a word id
# only when the batch has exactly one row.
p[0].argmax()

2

In [46]:
# Probability of word id 2, the id reported as most probable by the previous cell.
p[0, 2]

0.12759411

In [47]:
# Look up the word that corresponds to id 2.
tk.index_word[2]

'the'

### 전이학습

In [48]:
# Tokenize the reviews again for the sentiment task. This is the same call
# that produced `seqs` earlier in the notebook, repeated here so this section
# starts from the token-id sequences.
seqs = tk.texts_to_sequences(df['review'])

In [51]:
# Pad the variable-length sequences so they can be fed to the model as one array.
# NOTE(review): no maxlen is passed, so presumably every row is padded to the
# longest review -- consider an explicit maxlen to bound memory; confirm.
pads = tf.keras.preprocessing.sequence.pad_sequences(seqs)

### 임베딩 불러오기

In [53]:
import numpy as np

# Load the embedding matrix stored under the key 'emb' in 'word-emb.npz'.
# np.load on an .npz returns an archive object backed by an open file; the
# with-block closes it once the array has been read into memory.
with np.load('word-emb.npz') as z:
    e = z['emb']

### 텍스트로 감성분석 하기

In [55]:
# Sentiment classifier with the same shape as the language model's front end,
# but its embedding table starts from the matrix `e` instead of a random init,
# and the head is a single sigmoid unit.
# NOTE(review): the inputs are zero-padded and this Embedding does not set
# mask_zero, so padding positions presumably take part in the average -- confirm
# whether masking is wanted (and how rows with no tokens should be handled).
pretrained_init = tf.keras.initializers.Constant(e)
emb2 = tf.keras.layers.Embedding(
    input_dim=tk.num_words + 1,
    output_dim=8,
    embeddings_initializer=pretrained_init,
)

model = tf.keras.Sequential()
model.add(emb2)
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [56]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 8)           24008     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 8)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
Total params: 24,089
Trainable params: 24,089
Non-trainable params: 0
_________________________________________________________________


In [57]:
# Single sigmoid output, so train with binary cross-entropy and track accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [58]:
# Training labels: the 'sentiment' column as an array.
# This rebinds `y`, which earlier held the single next-word id ys[0].
y = df['sentiment'].values

In [59]:
# Train the sentiment classifier on the padded sequences; no epochs or
# validation arguments are passed, so Keras' defaults apply.
# NOTE(review): nothing is held out here, so the reported accuracy is on
# training data only.
model.fit(pads,y)

Train on 194340 samples


<tensorflow.python.keras.callbacks.History at 0x14d0adfa748>