### 데이터 받기

In [None]:
import requests

# Download the zipped IMDB review CSV and store it next to the notebook.
res = requests.get(
    'https://github.com/euphoris/datasets/raw/master/imdb.zip',
    timeout=30,  # do not hang the kernel forever on a stalled connection
)
# Fail loudly on an HTTP error instead of writing an error page into imdb.zip.
res.raise_for_status()

with open('imdb.zip', 'wb') as f:
  f.write(res.content)

In [None]:
import pandas as pd

df = pd.read_csv('imdb.zip')
df

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
995,I just got bored watching Jessice Lange take h...,0
996,"Unfortunately, any virtue in this film's produ...",0
997,"In a word, it is embarrassing.",0
998,Exceptionally bad!,0


### 토큰화

In [None]:
import tensorflow as tf

# Word-level tokenizer limited by num_words=2000; '<unk>' is registered as the out-of-vocabulary token.
tk = tf.keras.preprocessing.text.Tokenizer(num_words=2000, oov_token='<unk>')

In [None]:
tk.fit_on_texts(df['review'])

In [None]:
tk.word_index

{'<unk>': 1,
 'the': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'is': 6,
 'this': 7,
 'i': 8,
 'it': 9,
 'to': 10,
 'in': 11,
 'was': 12,
 'movie': 13,
 'film': 14,
 'that': 15,
 'for': 16,
 'as': 17,
 'but': 18,
 'with': 19,
 'one': 20,
 'on': 21,
 'you': 22,
 'are': 23,
 'not': 24,
 'bad': 25,
 "it's": 26,
 'very': 27,
 'all': 28,
 'just': 29,
 'so': 30,
 'good': 31,
 'at': 32,
 'an': 33,
 'be': 34,
 'there': 35,
 'about': 36,
 'have': 37,
 'by': 38,
 'like': 39,
 'from': 40,
 'if': 41,
 'acting': 42,
 'time': 43,
 'out': 44,
 'his': 45,
 'or': 46,
 'really': 47,
 'great': 48,
 'even': 49,
 'he': 50,
 'who': 51,
 'were': 52,
 'has': 53,
 'see': 54,
 'my': 55,
 'characters': 56,
 'well': 57,
 'most': 58,
 'how': 59,
 'more': 60,
 'no': 61,
 'only': 62,
 'when': 63,
 'ever': 64,
 '10': 65,
 'movies': 66,
 'plot': 67,
 'story': 68,
 'made': 69,
 'some': 70,
 'they': 71,
 'best': 72,
 'because': 73,
 'your': 74,
 'can': 75,
 'also': 76,
 "don't": 77,
 'films': 78,
 'than': 79,
 'its': 80,
 'scrip

In [None]:
tk.word_index['good']

31

In [None]:
tk.index_word[31]

'good'

In [None]:
import joblib

joblib.dump(tk, 'tokenizer.pkl')

['tokenizer.pkl']

### 전처리

In [None]:
import pandas as pd

df = pd.read_csv('imdb.zip')

In [None]:
import joblib

tk = joblib.load('tokenizer.pkl')

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
seqs = tk.texts_to_sequences(df['review'])
seqs[0]

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [None]:
tk.index_word[1]

'<unk>'

In [None]:
# Decode the first encoded review back into words. Every word is followed by
# one space, so the decoded string ends with a trailing space.
review0 = ''.join(tk.index_word[token_id] + ' ' for token_id in seqs[0])

print(review0, len(seqs[0]), sep='\n')

a very very very slow moving aimless movie about a distressed drifting young man 
14


In [None]:
tk.index_word[4], tk.index_word[27], tk.index_word[287]

('a', 'very', 'slow')

In [None]:
seq = seqs[0]

In [None]:
list(range(0, len(seq) - 4)) # start offsets of the 5-grams: this 14-token sentence yields 10 of them

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [None]:
# Preview how each review is cut into (4-token context, next token) pairs.
# Only the first few reviews are printed: dumping all of them produces
# thousands of lines, which the notebook front end truncates anyway.
for seq in seqs[:3]:
  print("**", seq, "**")
  for i in range(0, len(seq) - 4):
    print(seq[i:i+4], seq[i+4])
  print("=" * 40)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[162, 3, 22, 47] 385
[3, 22, 47, 385] 16
[22, 47, 385, 16] 117
** [33, 1, 313, 19, 4, 48, 478, 3, 4, 1, 411, 450, 2, 179, 1] **
[33, 1, 313, 19] 4
[1, 313, 19, 4] 48
[313, 19, 4, 48] 478
[19, 4, 48, 478] 3
[4, 48, 478, 3] 4
[48, 478, 3, 4] 1
[478, 3, 4, 1] 411
[3, 4, 1, 411] 450
[4, 1, 411, 450] 2
[1, 411, 450, 2] 179
[411, 450, 2, 179] 1
** [77, 1, 9] **
** [7, 6, 4, 955, 198, 5, 14, 295, 19, 141, 849, 1, 3, 444, 1, 112, 11, 7, 1, 3, 28, 1, 5, 2, 1, 1, 275, 604, 1, 134, 15, 1, 2, 618, 1, 5, 1, 3, 2, 1, 1, 5, 4, 1, 1, 1, 2, 1, 205, 1, 1, 3, 87, 1116, 10, 143, 3, 273, 1117, 1, 1, 1, 1, 1, 3, 1, 1117, 1, 3, 275, 1, 3, 60] **
[7, 6, 4, 955] 198
[6, 4, 955, 198] 5
[4, 955, 198, 5] 14
[955, 198, 5, 14] 295
[198, 5, 14, 295] 19
[5, 14, 295, 19] 141
[14, 295, 19, 141] 849
[295, 19, 141, 849] 1
[19, 141, 849, 1] 3
[141, 849, 1, 3] 444
[849, 1, 3, 444] 1
[1, 3, 444, 1] 112
[3, 444, 1, 112] 11
[444, 1, 112, 11] 7
[1, 112, 11, 7] 1
[112, 11, 7, 1]

In [None]:
# Slide a 5-token window over every review: the first four tokens form the
# context and the fifth token is the next-word target. Reviews with fewer
# than five tokens contribute nothing.
data = [
    (tokens[start:start + 4], tokens[start + 4])
    for tokens in seqs
    for start in range(len(tokens) - 4)
]

data

[([4, 27, 27, 27], 287),
 ([27, 27, 27, 287], 407),
 ([27, 27, 287, 407], 1217),
 ([27, 287, 407, 1217], 13),
 ([287, 407, 1217, 13], 36),
 ([407, 1217, 13, 36], 4),
 ([1217, 13, 36, 4], 1218),
 ([13, 36, 4, 1218], 1219),
 ([36, 4, 1218, 1219], 408),
 ([4, 1218, 1219, 408], 142),
 ([24, 522, 51, 12], 60),
 ([522, 51, 12, 60], 409),
 ([51, 12, 60, 409], 2),
 ([12, 60, 409, 2], 736),
 ([60, 409, 2, 736], 56),
 ([409, 2, 736, 56], 46),
 ([2, 736, 56, 46], 2),
 ([736, 56, 46, 2], 337),
 ([56, 46, 2, 337], 1220),
 ([46, 2, 337, 1220], 288),
 ([2, 337, 1220, 288], 5),
 ([337, 1220, 288, 5], 737),
 ([1220, 288, 5, 737], 738),
 ([288, 5, 737, 738], 44),
 ([1221, 1222, 19, 209], 233),
 ([1222, 19, 209, 233], 3),
 ([19, 209, 233, 3], 338),
 ([209, 233, 3, 338], 184),
 ([233, 3, 338, 184], 739),
 ([3, 338, 184, 739], 2),
 ([338, 184, 739, 2], 13),
 ([184, 739, 2, 13], 289),
 ([739, 2, 13, 289], 740),
 ([2, 13, 289, 740], 49),
 ([13, 289, 740, 49], 60),
 ([289, 740, 49, 60], 339),
 ([740, 49, 60, 

In [None]:
import random

# Shuffle with a dedicated, seeded generator so the sample order (and the
# lm-data.pkl file derived from it) is reproducible across kernel restarts
# without touching the global random state.
random.Random(42).shuffle(data)
data[0]

([145, 351, 755, 16], 234)

In [None]:
import numpy as np

# Split the (context, target) pairs into a 2-D array of contexts and a
# 1-D array of next-token targets.
xs = np.array([context for context, _ in data])
ys = np.array([target for _, target in data])

In [None]:
xs

array([[145, 351, 755,  16],
       [444,   1, 118, 846],
       [ 14,  10,  54,  41],
       ...,
       [ 30,   8, 231, 127],
       [252,  29, 986,   9],
       [ 28,  11,  28,   4]])

In [None]:
ys

array([ 234, 1118,   35, ...,   10,   21,  169])

In [None]:
joblib.dump((xs, ys), 'lm-data.pkl')

['lm-data.pkl']

### 학습

In [None]:
import joblib

tk = joblib.load('tokenizer.pkl')
xs, ys = joblib.load('lm-data.pkl')

In [None]:
import tensorflow as tf

# NOTE(review): the Keras Tokenizer docs state that only the num_words-1 most frequent words are kept, so the extra slot may be unnecessary — confirm.
NUM_WORD =  tk.num_words + 1 # +1 because index 0 is reserved for padding

In [None]:
tk.num_words

2000

In [None]:
tk.index_word

{1: '<unk>',
 2: 'the',
 3: 'and',
 4: 'a',
 5: 'of',
 6: 'is',
 7: 'this',
 8: 'i',
 9: 'it',
 10: 'to',
 11: 'in',
 12: 'was',
 13: 'movie',
 14: 'film',
 15: 'that',
 16: 'for',
 17: 'as',
 18: 'but',
 19: 'with',
 20: 'one',
 21: 'on',
 22: 'you',
 23: 'are',
 24: 'not',
 25: 'bad',
 26: "it's",
 27: 'very',
 28: 'all',
 29: 'just',
 30: 'so',
 31: 'good',
 32: 'at',
 33: 'an',
 34: 'be',
 35: 'there',
 36: 'about',
 37: 'have',
 38: 'by',
 39: 'like',
 40: 'from',
 41: 'if',
 42: 'acting',
 43: 'time',
 44: 'out',
 45: 'his',
 46: 'or',
 47: 'really',
 48: 'great',
 49: 'even',
 50: 'he',
 51: 'who',
 52: 'were',
 53: 'has',
 54: 'see',
 55: 'my',
 56: 'characters',
 57: 'well',
 58: 'most',
 59: 'how',
 60: 'more',
 61: 'no',
 62: 'only',
 63: 'when',
 64: 'ever',
 65: '10',
 66: 'movies',
 67: 'plot',
 68: 'story',
 69: 'made',
 70: 'some',
 71: 'they',
 72: 'best',
 73: 'because',
 74: 'your',
 75: 'can',
 76: 'also',
 77: "don't",
 78: 'films',
 79: 'than',
 80: 'its',
 81: 's

In [None]:
xs[0]

array([145, 351, 755,  16])

In [None]:
NUM_WORD

2001

### 단어 임베딩

In [None]:
emb1 = tf.keras.layers.Embedding(
    input_dim=NUM_WORD,
    output_dim=8, # each token id is mapped to an 8-dimensional vector
)

In [None]:
lm = tf.keras.Sequential([ # next-word model over 4-token contexts
    emb1, # 4 input tokens -> 4 vectors of 8 values each (32 values per sample)
    # Average the 4 token vectors position-wise: the result is a single 8-value vector.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(NUM_WORD) # Option 1: emit NUM_WORD (2001) raw scores per 4-token input, with no softmax here
    # tf.keras.layers.Dense(NUM_WORD, activation='softmax') # Option 2-1: fails with a shape mismatch when run as-is
    # tf.keras.layers.Dense(NUM_WORD-3, activation='softmax') # Option 2-2
])

In [None]:
lm.summary()
# The embedding output shape is (None, None, 8):
# the first None is the number of samples, the second None is the number of tokens per sample (4 here).

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 8)           16008     
                                                                 
 global_average_pooling1d (G  (None, 8)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 8)                 72        
                                                                 
 dense_1 (Dense)             (None, 2001)              18009     
                                                                 
Total params: 34,089
Trainable params: 34,089
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Why the output layer has no softmax activation:
# Option 1. Apply softmax inside the loss by passing from_logits=True at compile time, as below.
lm.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), # the sparse variant takes integer class ids, so y does not need one-hot encoding
    optimizer='adam',
    metrics=['accuracy']
)

# Option 2. Apply softmax in the final Dense layer instead and one-hot encode the targets.
# lm.compile(
#     loss=tf.keras.losses.CategoricalCrossentropy(),
#     optimizer='adam',
#     metrics=['accuracy']
# )

# from tensorflow.keras.utils import to_categorical

# ys_onehot = to_categorical(ys)
# NOTE(review): to_categorical infers the class count from max(ys) unless
# num_classes is given; passing num_classes=NUM_WORD would presumably make the
# one-hot width match the 2001-unit output layer — confirm before enabling.

# ys_onehot.shape

In [None]:
# Option 1: integer targets with the sparse cross-entropy loss
lm.fit(xs, ys, epochs=1)

# Option 2: one-hot targets with CategoricalCrossentropy
# lm.fit(xs, ys_onehot, epochs=1) # ValueError: Shapes (None, 1) and (None, 2001) are incompatible



<keras.callbacks.History at 0x7fd69e317f90>

In [None]:
# NOTE(review): newer Keras releases reportedly accept only a `.keras` or `.h5` suffix here — confirm for the installed version. The later load_model('lm.krs') call must use the same path.
lm.save('lm.krs')

### 단어 임베딩 추출 및 저장

In [None]:
emb1

<keras.layers.core.embedding.Embedding at 0x7fd69df14ad0>

In [None]:
emb1.embeddings

<tf.Variable 'embedding/embeddings:0' shape=(2001, 8) dtype=float32, numpy=
array([[-2.3767710e-02, -2.3438919e-02, -2.4999380e-03, ...,
        -3.2004155e-02,  4.0671099e-02,  1.4002923e-02],
       [ 3.3438158e-01, -4.1123575e-01, -3.8758749e-01, ...,
         3.5206226e-01,  3.1876928e-01, -4.4302851e-01],
       [ 3.4696090e-01, -3.3030328e-01, -3.7762007e-01, ...,
         2.7905491e-01,  2.4863632e-01, -3.1959009e-01],
       ...,
       [ 6.9497831e-02, -2.6985377e-02, -4.8094500e-02, ...,
         3.1417178e-03,  5.4674633e-02, -7.2542459e-02],
       [ 5.8878347e-02, -7.6967023e-02, -5.2758589e-02, ...,
         1.1407813e-02,  3.0728444e-02, -1.5243010e-02],
       [-3.5904419e-02,  4.0932860e-02, -3.7242100e-04, ...,
        -3.3034638e-02, -1.4209997e-02,  2.2455160e-02]], dtype=float32)>

In [None]:
e = emb1.embeddings.numpy()
e

array([[-2.3767710e-02, -2.3438919e-02, -2.4999380e-03, ...,
        -3.2004155e-02,  4.0671099e-02,  1.4002923e-02],
       [ 3.3438158e-01, -4.1123575e-01, -3.8758749e-01, ...,
         3.5206226e-01,  3.1876928e-01, -4.4302851e-01],
       [ 3.4696090e-01, -3.3030328e-01, -3.7762007e-01, ...,
         2.7905491e-01,  2.4863632e-01, -3.1959009e-01],
       ...,
       [ 6.9497831e-02, -2.6985377e-02, -4.8094500e-02, ...,
         3.1417178e-03,  5.4674633e-02, -7.2542459e-02],
       [ 5.8878347e-02, -7.6967023e-02, -5.2758589e-02, ...,
         1.1407813e-02,  3.0728444e-02, -1.5243010e-02],
       [-3.5904419e-02,  4.0932860e-02, -3.7242100e-04, ...,
        -3.3034638e-02, -1.4209997e-02,  2.2455160e-02]], dtype=float32)

In [None]:
e.shape

(2001, 8)

In [None]:
emb1.get_weights()

[array([[-2.3767710e-02, -2.3438919e-02, -2.4999380e-03, ...,
         -3.2004155e-02,  4.0671099e-02,  1.4002923e-02],
        [ 3.3438158e-01, -4.1123575e-01, -3.8758749e-01, ...,
          3.5206226e-01,  3.1876928e-01, -4.4302851e-01],
        [ 3.4696090e-01, -3.3030328e-01, -3.7762007e-01, ...,
          2.7905491e-01,  2.4863632e-01, -3.1959009e-01],
        ...,
        [ 6.9497831e-02, -2.6985377e-02, -4.8094500e-02, ...,
          3.1417178e-03,  5.4674633e-02, -7.2542459e-02],
        [ 5.8878347e-02, -7.6967023e-02, -5.2758589e-02, ...,
          1.1407813e-02,  3.0728444e-02, -1.5243010e-02],
        [-3.5904419e-02,  4.0932860e-02, -3.7242100e-04, ...,
         -3.3034638e-02, -1.4209997e-02,  2.2455160e-02]], dtype=float32)]

In [None]:
w = emb1.get_weights()[0]
w

array([[-2.3767710e-02, -2.3438919e-02, -2.4999380e-03, ...,
        -3.2004155e-02,  4.0671099e-02,  1.4002923e-02],
       [ 3.3438158e-01, -4.1123575e-01, -3.8758749e-01, ...,
         3.5206226e-01,  3.1876928e-01, -4.4302851e-01],
       [ 3.4696090e-01, -3.3030328e-01, -3.7762007e-01, ...,
         2.7905491e-01,  2.4863632e-01, -3.1959009e-01],
       ...,
       [ 6.9497831e-02, -2.6985377e-02, -4.8094500e-02, ...,
         3.1417178e-03,  5.4674633e-02, -7.2542459e-02],
       [ 5.8878347e-02, -7.6967023e-02, -5.2758589e-02, ...,
         1.1407813e-02,  3.0728444e-02, -1.5243010e-02],
       [-3.5904419e-02,  4.0932860e-02, -3.7242100e-04, ...,
        -3.3034638e-02, -1.4209997e-02,  2.2455160e-02]], dtype=float32)

In [None]:
np.array_equal(e, w)

True

In [None]:
np.savez('word_emb.npz', emb=e)

### GlobalAveragePooling1D

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# One sample with two time steps of three features each -> shape (1, 2, 3).
x = np.array([1., 2., 3., 3., 6., 9.], dtype=np.float64).reshape(1, 2, 3)
x

array([[[1., 2., 3.],
        [3., 6., 9.]]])

In [None]:
x.shape

(1, 2, 3)

In [None]:
avg = tf.keras.layers.GlobalAveragePooling1D()
avg

<keras.layers.pooling.global_average_pooling1d.GlobalAveragePooling1D at 0x7fd698153a50>

In [None]:
y = avg(x).numpy()
y

array([[2., 4., 6.]], dtype=float32)

### 다음 토큰 확률 예측

In [None]:
import joblib

tk = joblib.load('tokenizer.pkl')

In [None]:
xs, ys = joblib.load('lm-data.pkl')

In [None]:
import tensorflow as tf

lm = tf.keras.models.load_model('lm.krs')

In [None]:
x = xs[2:3]
y = ys[2]

In [None]:
x

array([[14, 10, 54, 41]])

In [None]:
y

35

In [None]:
[tk.index_word[i] for i in x[0]]

['film', 'to', 'see', 'if']

In [None]:
tk.index_word[5]

'of'

In [None]:
import numpy as np

# The saved model's last layer has no activation, so predict() returns raw scores (logits), one per vocabulary index.
logit = lm.predict(x.astype(float))
logit.shape



(1, 2001)

In [None]:
logit, len(logit[0])

(array([[-3.9640586,  4.072164 ,  3.3809114, ..., -3.9863973, -4.093557 ,
         -3.943457 ]], dtype=float32), 2001)

In [None]:
p = tf.nn.softmax(logit).numpy()
p

array([[3.2887685e-05, 1.0165303e-01, 5.0922897e-02, ..., 3.2161155e-05,
        2.8893011e-05, 3.3572236e-05]], dtype=float32)

In [None]:
p.sum()

1.0000001

In [None]:
p[0, 35]

0.0020008467

In [None]:
# p has one row per input sample; take the argmax of the single row explicitly.
# A bare p.argmax() indexes the flattened array and only equals the
# vocabulary index while the batch holds exactly one sample.
i = p[0].argmax()
i

1

In [None]:
p[0, i]

0.10165303

In [None]:
tk.index_word[i]

'<unk>'

### 전이 학습

In [None]:
import pandas as pd

df = pd.read_csv('imdb.zip')
df

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
995,I just got bored watching Jessice Lange take h...,0
996,"Unfortunately, any virtue in this film's produ...",0
997,"In a word, it is embarrassing.",0
998,Exceptionally bad!,0


In [None]:
import joblib

tk = joblib.load('tokenizer.pkl')

In [None]:
seqs = tk.texts_to_sequences(df['review'])

In [None]:
seqs[0]

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [None]:
seqs[1]

[24, 522, 51, 12, 60, 409, 2, 736, 56, 46, 2, 337, 1220, 288, 5, 737, 738, 44]

In [None]:
import tensorflow as tf

pads = tf.keras.preprocessing.sequence.pad_sequences(seqs)

In [None]:
print(pads[0], len(pads[0]), max([len(seq) for seq in seqs]), sep='\n')

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    4   27   27   27  287  407 1217   13   36    4 1218
 1219  408  142]
73
73


### 단어 임베딩 불러오기

In [None]:
import numpy as np

# Load the embedding matrix exported earlier with np.savez('word_emb.npz', emb=e).
# The file name must match that call exactly (underscore, not hyphen);
# otherwise this cell fails on a fresh kernel.
z = np.load('word_emb.npz')
e = z['emb']
e

### 감성분석

In [None]:
# Reuse the language-model embedding matrix `e` as the initial weights.
# The layer keeps Keras' default trainable=True, so the weights are fine-tuned
# during sentiment training; pass trainable=False to freeze them instead.
emb2 = tf.keras.layers.Embedding(
    input_dim = tk.num_words + 1,
    output_dim = 8,
    embeddings_initializer = tf.keras.initializers.Constant(e),
    # pad_sequences fills short reviews with index 0; masking it keeps the
    # padding positions out of the GlobalAveragePooling1D average.
    mask_zero = True,
)

In [None]:
# Sentiment classifier: averaged token embeddings -> 8-unit ReLU layer -> one sigmoid unit.
model = tf.keras.Sequential([
    emb2,
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
    ])

In [None]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 8)           16008     
                                                                 
 global_average_pooling1d_3   (None, 8)                0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_4 (Dense)             (None, 8)                 72        
                                                                 
 dense_5 (Dense)             (None, 1)                 9         
                                                                 
Total params: 16,089
Trainable params: 16,089
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [None]:
y = df['sentiment'].values

In [None]:
model.fit(pads, y)



<keras.callbacks.History at 0x7fd69980aad0>