In [25]:
import os 
import datetime
import tensorflow as tf
from tensorflow import keras
import numpy as np
import json
from sklearn.model_selection import train_test_split

In [26]:
# Load the preprocessed training data (train_data)
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
INPUT_TRAIN_DATA_FILE_NAME = 'nsmc_train_input.npy'
LABEL_TRAIN_DATA_FILE_NAME = 'nsmc_train_label.npy'
DATA_CONFIGS_FILE_NAME = 'data_configs.json'

# Open every file inside a `with` block so the handle is closed as soon as the
# contents have been read, instead of staying open for the life of the kernel.
with open(DATA_IN_PATH + INPUT_TRAIN_DATA_FILE_NAME, 'rb') as input_file:
    input_data = np.load(input_file)
with open(DATA_IN_PATH + LABEL_TRAIN_DATA_FILE_NAME, 'rb') as label_file:
    label_data = np.load(label_file)
with open(DATA_IN_PATH + DATA_CONFIGS_FILE_NAME, 'r') as configs_file:
    prepro_configs = json.load(configs_file)

In [27]:
# Display the dimensions of the loaded input and label arrays side by side
(input_data.shape, label_data.shape)

((149995, 8), (149995,))

In [28]:
# Hold out 20% of the loaded data for validation; the fixed random_state
# makes the split identical on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    input_data,
    label_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Training-loop settings
n_epochs = 10
batch_size = 16

# Embedding settings
embedding_size = 128
# One more id than the vocabulary size recorded by preprocessing
# (presumably to reserve an index for padding -- TODO confirm).
vocab_size = 1 + prepro_configs['vocab_size']

# Model (basic feed-forward)

In [112]:
# Width of the embedding vectors and of the hidden dense layer.
# This used to be read from `batch_size`, which tied the network architecture
# to a training-loop setting: tuning the batch size would silently resize the
# model. The value is unchanged (16), it just has its own name now.
hidden_units = 16

# Baseline classifier: embed token ids, average over the sequence axis,
# then pass the pooled vector through one hidden layer.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, hidden_units, input_shape=(None,)),
    keras.layers.Dropout(0.2),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(hidden_units, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation='sigmoid')  # binary classification
])
model.summary()

Model: "sequential_53"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_41 (Embedding)     (None, None, 16)          700128    
_________________________________________________________________
dropout_135 (Dropout)        (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d_9 ( (None, 16)                0         
_________________________________________________________________
dropout_136 (Dropout)        (None, 16)                0         
_________________________________________________________________
dense_83 (Dense)             (None, 16)                272       
_________________________________________________________________
dropout_137 (Dropout)        (None, 16)                0         
_________________________________________________________________
dense_84 (Dense)             (None, 1)               

In [69]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Training

In [70]:
# Train on the training split and evaluate on the validation split each epoch;
# the returned object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=n_epochs,
    verbose=1,
    validation_data=(X_valid, y_valid),
)

Train on 119996 samples, validate on 29999 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Evaluate

In [71]:
# Load the preprocessed test data (test_data)
INPUT_TEST_DATA = 'nsmc_test_input.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'

# Use `with` so each file handle is closed right after the array is read,
# rather than being left open for the rest of the session.
with open(DATA_IN_PATH + INPUT_TEST_DATA, 'rb') as test_input_file:
    test_input_data = np.load(test_input_file)
with open(DATA_IN_PATH + LABEL_TEST_DATA, 'rb') as test_label_file:
    test_label_data = np.load(test_label_file)

In [72]:
# Score the trained model on the test arrays and display the result
results = model.evaluate(x=test_input_data, y=test_label_data)
results



[0.4187378094143006, 0.8119487]

# CNN

In [116]:
# CNN classifier: embed token ids, apply a width-3 1D convolution, max-pool
# over the sequence axis, then classify through one hidden dense layer.
# The embedding width comes from the `embedding_size` configuration constant
# (128) instead of a duplicated literal, and layers are referenced through the
# same `keras` alias used for the Sequential container.
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_size),
    keras.layers.Dropout(0.5),
    keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding="VALID"),
    keras.layers.Dropout(0.5),
    keras.layers.GlobalMaxPool1D(),
    keras.layers.Dense(250, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid')  # binary classification
])
model.summary()

Model: "sequential_55"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_43 (Embedding)     (None, None, 128)         5601024   
_________________________________________________________________
dropout_143 (Dropout)        (None, None, 128)         0         
_________________________________________________________________
conv1d_100 (Conv1D)          (None, None, 128)         49280     
_________________________________________________________________
dropout_144 (Dropout)        (None, None, 128)         0         
_________________________________________________________________
global_max_pooling1d_62 (Glo (None, 128)               0         
_________________________________________________________________
dense_87 (Dense)             (None, 250)               32250     
_________________________________________________________________
dropout_145 (Dropout)        (None, 250)             

## Training 

In [118]:
# Configure the CNN with the same loss, optimizer and metric as the baseline
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

# Train on the training split and evaluate on the validation split each epoch
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=n_epochs,
    verbose=1,
    validation_data=(X_valid, y_valid),
)

Train on 119996 samples, validate on 29999 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Evaluate

In [119]:
# Score the CNN on the test arrays loaded earlier in the notebook and display the result
results = model.evaluate(x=test_input_data, y=test_label_data)
results



[0.4270623373952387, 0.81568897]

| models       | Train_accuracy | Validation_accuracy | Test_accuracy |
|--------------|----------------|---------------------|---------------|
| feed_forward | 85.61%         | 81.45%              | 81.19%        |
| CNN          | 89.98%         | 81.82%              | 81.57%        |