In [1]:
import numpy as numpy
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as tf_keras




In [2]:
# Root folder of the review corpus; it is expected to contain `train` and `test`
# sub-folders (the loader reports 2 classes for each).
# NOTE(review): machine-specific absolute path - point this at your local copy.
DATA_DIR = r"C:\Work\2024\minion\Workspace\ml_basic\data\hide_data"
BATCH_SIZE = 32

# Each element of these datasets is a (reviews, labels) batch.
train_dataset = tf_keras.utils.text_dataset_from_directory(DATA_DIR + r"\train", batch_size=BATCH_SIZE)
test_dataset = tf_keras.utils.text_dataset_from_directory(DATA_DIR + r"\test", batch_size=BATCH_SIZE)

# Text-only view of the training set (labels dropped); used to build the vocabulary.
review_only_dataset = train_dataset.map(lambda review, label: review)

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [3]:
# Inspect the concrete dataset class returned by the loader.
type(train_dataset)

tensorflow.python.data.ops.batch_op._BatchDataset

In [4]:
# Peek at a single batch: print the batch shapes, then the first review and its label.
separator = "*" * 50
for reviews, labels in train_dataset.take(1):
    print(reviews.shape, labels.shape)
    print(separator)
    print(reviews[0])
    print(separator)
    print(labels[0])

(32,) (32,)
**************************************************
tf.Tensor(b"Just saw this movie, and what a waste of time. The movie was predictable and slow. It's basically the Mormon bad news bears that play church sanctioned basketball. Rather than watching this movie, I should have had a root canal. The cameo performances were obviously driven by sponsorship / funding. This movie had potential due to the outrageous behavior that is exhibited by Mormons when they play church sanctioned basketball, however because it's rated PG, the true nature of the spectacle could not be transfered to film. The acting is horrible with the exception of Clint Howard and Fred Willard. Thurl Bailey's appearance in the film was completely unnecessary.", shape=(), dtype=string)
**************************************************
tf.Tensor(0, shape=(), dtype=int32)


In [5]:
# Encoding step: turn each review (a string of words) into a sequence of integer token ids.
MAX_TOKENS = 100000     # dictionary size: maximum number of distinct tokens kept
SEQUENCE_LENGTH = 300   # number of tokens produced per review

text_vectorizer = tf_keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)

# Build the word dictionary from the given data (training reviews only).
text_vectorizer.adapt(review_only_dataset)





In [6]:
# Sanity-check the vectorizer on one batch.
# `d` is kept as a notebook-level name because the decoding cell further down reads it.
for x, y in train_dataset.take(1):
    d = text_vectorizer(x)  # batch of 32 strings, shape (32,) -> token ids, shape (32, 300)
    print(d.shape, d, sep="\n")


(32, 300)
tf.Tensor(
[[ 120   83    7 ...    0    0    0]
 [  45   11  671 ...    2 1983 6782]
 [1377  163  837 ...    0    0    0]
 ...
 [ 101  308   11 ...    0    0    0]
 [1162 3374  194 ...    0    0    0]
 [  10 1060   11 ...    0    0    0]], shape=(32, 300), dtype=int64)


In [7]:
# Vocabulary learned by the vectorizer; the decoding cell looks words up by token id in this list.
dictionary = text_vectorizer.get_vocabulary()
vocab_size = len(dictionary)
print(vocab_size)

# Show a small window of the vocabulary (entries 10 through 19).
dictionary[10:20]

100000


['i', 'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but']

In [8]:
# Turn the first vectorized review back into text.
# Convert to NumPy once so the loop works on plain integers instead of
# comparing and indexing with an eager tensor for every token.
first_review_ids = d[0].numpy()
print(first_review_ids[:10])

# Id 0 is skipped (it fills the tail of short reviews); every other id is
# looked up in the vocabulary list.
decoded_words = [dictionary[token_id] for token_id in first_review_ids if token_id != 0]
print(" ".join(decoded_words))

[ 120   83    7   34  412 1857 1246  521   33  682]
show people is an absolutely delightful silent directed by king vidor and starring marion davies and billy haines what gems both of them are in this charming comedy about a young girl peggy pepper whose acting is the talk of savannah trying to make it on the big screen though shes a success in comedy what she wants to do is make art so she moves up to high arts studio soon she becomes patricia pepoire and is too good for the likes of her friend [UNK] br many stars of the silent era have cameos in show people including davies herself without the curly hair and makeup im sure when people saw the film in 1928 they recognized everyone who appeared in the elaborate lunch scene sadly nowadays its not the case even for film buffs in one part of the film however she does meet charlie chaplin in another author elinor glyn is pointed out to her and vidor himself has a cameo at the end of the film other stars who pop up in show people are john g

In [9]:
# Embedding model: maps every token id to a dense vector.
# The input tensor is named `token_ids` rather than `input`, because `input`
# would shadow Python's built-in input() for the rest of the notebook session.
token_ids = tf_keras.layers.Input(shape=(None,))  # variable-length sequences of token ids
output = tf_keras.layers.Embedding(
    input_dim=100000,  # same value as the vectorizer's max_tokens
    output_dim=100,    # length of the vector produced for each token
)(token_ids)

embedding_model = tf_keras.models.Model(inputs=token_ids, outputs=output)

In [10]:
# Push a single batch of raw reviews through both stages.
# (The spelling `vectorized_reviwe` is kept: the following cell reads that name.)
for review in review_only_dataset.take(1):
    vectorized_reviwe = text_vectorizer(review)           # each word -> one integer id
    embedded_review = embedding_model(vectorized_reviwe)  # each id -> one vector of 100 values

In [11]:
# Compare the shapes before and after the embedding step.
vectorized_reviwe.shape, embedded_review.shape # batch, (token count, embedding values) -> model input

(TensorShape([32, 300]), TensorShape([32, 300, 100]))

In [26]:
# Convert every review string in the training data to token ids; labels pass through untouched.
def _vectorize_review(review, label):
    """Return the (token ids, label) pair for one (review text, label) element."""
    return text_vectorizer(review), label


vectorized_train_dataset = train_dataset.map(_vectorize_review)

In [28]:
# Verify the conversion on one batch: print the token-id matrix, then the labels.
for x, y in vectorized_train_dataset.take(1):
    print(x, y, sep="\n")

tf.Tensor(
[[19700  1664  1789 ...     0     0     0]
 [20567     7     4 ...     0     0     0]
 [   10   110   517 ...     0     0     0]
 ...
 [   11     7   353 ...     0     0     0]
 [    4  3659     6 ...   165  5350  1512]
 [  696     3 85646 ...     0     0     0]], shape=(32, 300), dtype=int64)
tf.Tensor([0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0], shape=(32,), dtype=int32)


In [29]:
# Model definition: token ids -> embeddings -> LSTM -> sigmoid probability.
# The input tensor is named `token_ids` rather than `input`, because `input`
# would shadow Python's built-in input() for the rest of the notebook session.
token_ids = tf_keras.layers.Input(shape=(None,))  # variable length; a fixed 300 would also work
x = tf_keras.layers.Embedding(
    input_dim=100000,
    output_dim=100,
    # The vectorizer fills short reviews with trailing zeros. Masking id 0 lets
    # the LSTM skip those positions instead of ending on a long run of padding.
    mask_zero=True,
)(token_ids)  # -> (tokens, 100) per review
x = tf_keras.layers.LSTM(units=16)(x)
output = tf_keras.layers.Dense(units=1, activation="sigmoid")(x)

model = tf_keras.models.Model(inputs=token_ids, outputs=output)

model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 100)         10000000  
                                                                 
 lstm_2 (LSTM)               (None, 16)                7488      
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 10007505 (38.18 MB)
Trainable params: 10007505 (38.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [30]:
# Binary classification set-up. `metrics` is passed as a list, which is the
# documented form; a bare string is not accepted by every Keras version.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [31]:
# Train for 10 epochs on the vectorized training data.
# `history` is the correctly spelled name; the original misspelling `hisotry`
# is kept as an alias so any code that still refers to it keeps working.
history = hisotry = model.fit(vectorized_train_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
