In [1]:
!pip install tensorflow



In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Conv1D, GlobalMaxPooling1D,
    Dense, Dropout, Concatenate
)
import numpy as np

In [3]:
# --- 1. 하이퍼파라미터 및 설정 ---
VOCAB_SIZE = 10000    # Keras IMDB 로더는 어휘 사전 크기를 미리 지정
MAX_LEN = 250         # 문장의 최대 길이 (패딩 기준)
EMBED_DIM = 100
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5] # PyTorch 버전과 동일
NUM_CLASSES = 1       # (Keras에서는 긍정/부정(0, 1) 분류 시 1로 설정)
BATCH_SIZE = 64
EPOCHS = 3

In [4]:
# --- 2. 데이터 준비 (Keras IMDB) ---

# 2.1. 데이터 로드 (매우 간편함)
# Keras는 데이터를 이미 토큰화하고 정수 인덱스로 변환해 줍니다.
print(f"Loading IMDB data (Top {VOCAB_SIZE} words)...")
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

print(f"Train/Test 데이터 수: {len(x_train)} / {len(x_test)}")

# 2.2. 패딩 (Collate Function 대신 사용)
#
# PyTorch의 'collate_batch'와 동일한 역할.
# 모든 문장을 MAX_LEN 길이에 맞추고, 짧으면 뒤(<pad>)를 0으로 채움.
print("Padding sequences...")
x_train_pad = pad_sequences(x_train, maxlen=MAX_LEN, padding='post')
x_test_pad = pad_sequences(x_test, maxlen=MAX_LEN, padding='post')

print(f"Padded train data shape: {x_train_pad.shape}")

Loading IMDB data (Top 10000 words)...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Train/Test 데이터 수: 25000 / 25000
Padding sequences...
Padded train data shape: (25000, 250)


In [5]:

# --- 3. TextCNN 모델 정의 (Keras Functional API) ---
# (PyTorch의 nn.Module 클래스 정의와 동일한 역할)

def build_text_cnn_model():
  # 1. 입력 레이어
  # [BATCH_SIZE, MAX_LEN] 형태의 정수 시퀀스가 들어올 것임
  inputs = Input(shape=(MAX_LEN,), dtype='int32')

  # 2. 임베딩 레이어 (nn.Embedding과 동일)
  # [BATCH_SIZE, MAX_LEN] -> [BATCH_SIZE, MAX_LEN, EMBED_DIM]
  embedding_layer = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBED_DIM,
    input_length=MAX_LEN
  )
  embedded = embedding_layer(inputs)

  # 3. Convolution + Max Pooling (병렬 처리)
  #
  conv_blocks = []
  for fs in FILTER_SIZES:
    # (PyTorch의 nn.Conv2d 대신 Conv1D를 사용하여 더 직관적)
    conv = Conv1D(
      filters=NUM_FILTERS,
      kernel_size=fs, # fs = 3, 4, 5
      activation='relu'
    )(embedded)

    # (PyTorch의 F.max_pool1d와 동일)
    pool = GlobalMaxPooling1D()(conv)
    conv_blocks.append(pool)

  # 4. 모든 필터 결과 결합 (torch.cat과 동일)
  # [BATCH_SIZE, NUM_FILTERS * len(FILTER_SIZES)]
  concatenated = Concatenate()(conv_blocks)

  # 5. Dropout
  dropped = Dropout(0.5)(concatenated)

  # 6. Fully Connected Layer
  # (이진 분류이므로 sigmoid와 1개 유닛 사용)
  outputs = Dense(NUM_CLASSES, activation='sigmoid')(dropped)

  # 7. 모델 생성
  model = Model(inputs=inputs, outputs=outputs)
  return model

model = build_text_cnn_model()



In [6]:
# --- 4. 모델 컴파일 (Compile) ---
# (PyTorch의 criterion, optimizer 설정과 동일)
model.compile(
  optimizer='adam',
  loss='binary_crossentropy', # 이진 분류 손실
  metrics=['accuracy']
)

# 모델 구조 요약
model.summary()

In [7]:
# --- 5. 모델 학습 (Fit) ---
# (PyTorch의 수동 학습 루프(train_loop)와 동일)
print("\n--- 모델 학습 시작 ---")
history = model.fit(
  x_train_pad,
  y_train,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_data=(x_test_pad, y_test)
)

print("--- 모델 학습 완료 ---")


--- 모델 학습 시작 ---
Epoch 1/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 428ms/step - accuracy: 0.6713 - loss: 0.5773 - val_accuracy: 0.8694 - val_loss: 0.3103
Epoch 2/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 403ms/step - accuracy: 0.8875 - loss: 0.2736 - val_accuracy: 0.8865 - val_loss: 0.2705
Epoch 3/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 407ms/step - accuracy: 0.9346 - loss: 0.1771 - val_accuracy: 0.8858 - val_loss: 0.2817
--- 모델 학습 완료 ---


In [8]:

# --- 6. (보너스) 새로운 리뷰 예측 ---

# Keras IMDB 데이터셋의 단어-인덱스 사전 로드
word_index = imdb.get_word_index()
# Keras가 예약한 특수 토큰 인덱스 (0: <pad>, 1: <start>, 2: <unk>, 3: <unused>)
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<pad>"] = 0
word_index["<start>"] = 1
word_index["<unk>"] = 2


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [9]:

def predict_sentiment_keras(model, text):
  # 1. 텍스트 전처리 (토큰화 -> 인덱싱)
  tokens = text.lower().split()
  indices = [word_index.get(word, 2) for word in tokens] # 모르면 <unk>(2)

  # 2. 패딩
  padded_text = pad_sequences([indices], maxlen=MAX_LEN, padding='post')

  # 3. 예측
  prediction = model.predict(padded_text, verbose=0)
  prob = prediction[0][0]

  if prob > 0.5:
    return f"긍정 (Positive) (확률: {prob*100:.2f}%)"
  else:
    return f"부정 (Negative) (확률: {(1-prob)*100:.2f}%)"


In [10]:

# 테스트
print("\n--- 새로운 리뷰 예측 테스트 ---")
test_review_1 = "This movie was absolutely fantastic, the best I have seen in years!"
test_review_2 = "It was a complete waste of time. The acting was terrible."

print(f"Review 1: {test_review_1}")
print(f"Prediction: {predict_sentiment_keras(model, test_review_1)}")
print("-" * 20)
print(f"Review 2: {test_review_2}")
print(f"Prediction: {predict_sentiment_keras(model, test_review_2)}")


--- 새로운 리뷰 예측 테스트 ---
Review 1: This movie was absolutely fantastic, the best I have seen in years!
Prediction: 긍정 (Positive) (확률: 69.83%)
--------------------
Review 2: It was a complete waste of time. The acting was terrible.
Prediction: 부정 (Negative) (확률: 99.74%)
