# 악플 분류기 - 다중분류

In [6]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
# 데이터 불러오기
df = pd.read_csv("/content/common_data1.csv", encoding='utf-8')

In [4]:
df.iloc[:, 1:].sum()

여성/가족     5195.0
남성        4819.0
성소수자      2116.0
인종/국적     4971.0
연령        4988.0
지역        5036.0
종교        4719.0
기타혐오      4974.0
악플/욕설    10894.0
clean     4647.0
분쟁유발      7681.0
dtype: float64

In [275]:
smile_train = pd.read_csv("/content/unsmile_train_v1.0.tsv", delimiter='\t')

In [313]:
smile_train = smile_train[smile_train['개인지칭'] == 0]

In [314]:
strain_data = []
for sentence in tqdm(smile_train.문장):
  morphs = mecab.morphs(sentence)
  tmp_X = [word for word in morphs if word not in stopwords]
  strain_data.append(tmp_X)

  0%|          | 0/14690 [00:00<?, ?it/s]

In [315]:
strain_data = t.texts_to_sequences(strain_data)

In [316]:
strain_data = pad_sequences(strain_data, maxlen=max_len)

In [317]:
strain_data[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0, 7097,    2,  442, 1369,    3,  120,  154,  241,    8,
       2694], dtype=int32)

## 데이터 전처리

In [7]:
# 중복 데이터 확인
print(df.shape, df.문장.nunique())

(47409, 12) 47409


In [9]:
# 분류가 안되어 있는 데이터 확인
print(df[df.sum(axis=1) == 0].index)

# 분류 안되어 있는 데이터 삭제
df = df[df.sum(axis=1) != 0]

Int64Index([38299, 44356], dtype='int64')


## 텍스트 전처리

In [10]:
# 한글 이외의 문자는 공백으로 처리하고 strip
df.문장 = df.문장.str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ').str.strip()
df.문장.replace('', np.nan, inplace=True)
print(df.문장.isna().sum())
df.dropna(how='any', inplace=True)
df.reset_index(drop=True, inplace=True)
df.shape

0


(47407, 12)

## 한글 형태소 분석

In [11]:
# Mecab 설치
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab_light_220429.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 115, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 115 (delta 11), reused 10 (delta 3), pack-reused 91[K
Receiving objects: 100% (115/115), 1.27 MiB | 9.57 MiB/s, done.
Resolving deltas: 100% (50/50), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.2 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (453 kB)
[K     |████████████████████████████████| 453 kB 70.6 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.0 konlpy-0.6.0
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-

In [17]:
from konlpy.tag import Mecab
from tqdm.notebook import tqdm

In [18]:
mecab = Mecab()
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을','ㅋㅋ','ㅠㅠ','ㅎㅎ']

In [19]:
train_data = []
for sentence in tqdm(df.문장):
  morphs = mecab.morphs(sentence)
  tmp_X = [word for word in morphs if word not in stopwords]
  train_data.append(tmp_X)

  0%|          | 0/47407 [00:00<?, ?it/s]

## 토큰화

In [20]:
import numpy as np
import tensorflow as tf
seed = 2022
np.random.seed(seed)
tf.random.set_seed(seed)

In [14]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(train_data, df.iloc[:, 1:], test_size=0.2)

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(X_train)

In [23]:
# 등장 빈도가 3 미만인 것의 갯수
threshold = 3
total_cnt = len(t.word_index)   # 단어의 수
rare_cnt = 0                    # 등장 빈도가 threshold 보다 작은 단어의 갯수
total_freq = 0                  # 훈련 데이터의 전체 단어의 빈도수의 합
rare_freq = 0                   # 등장 빈도가 threshold 보다 작은 단어의 등장 빈도수의 합

In [24]:
for key, value in t.word_counts.items():
  total_freq += value
  if value < threshold:
    rare_cnt += 1
    rare_freq += value

In [25]:
print('단어 집합(vocabulary)의 크기 :', total_cnt)
print(f'등장 빈도가 {threshold - 1}번 이하인 희귀 단어의 수: {rare_cnt}')
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

단어 집합(vocabulary)의 크기 : 36985
등장 빈도가 2번 이하인 희귀 단어의 수: 21328
단어 집합에서 희귀 단어의 비율: 57.66662160335271
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 2.926213551633787


In [26]:
# 모든 단어 사용
vocab_size = total_cnt + 2
vocab_size

36987

In [27]:
t = Tokenizer(num_words=vocab_size, oov_token='OOV')
t.fit_on_texts(train_data)
X_train = t.texts_to_sequences(X_train)
X_test = t.texts_to_sequences(X_test)

In [175]:
import pickle

with open('tokenizer.pkl','wb') as f:
  pickle.dump(t,f)

In [28]:
# 데이터의 최대/평균 길이
max(len(s) for s in X_train), sum(map(len, X_train)) / len(X_train)

(164, 23.96089650626236)

In [29]:
# 악플 길이를 100으로 설정
max_len = 100

In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

X_train.shape, X_test.shape

((37925, 100), (9482, 100))

In [121]:
Y_train = Y_train.values
Y_test = Y_test.values

In [122]:
Y_train.shape, Y_test.shape

((37925, 11), (9482, 11))

## 모델 정의/설정/학습

### LSTM + 어텐션

In [36]:
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Concatenate, Dropout
from tensorflow.keras import Input, Model
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [41]:
class BahdanauAttention(tf.keras.Model):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = Dense(units)
    self.W2 = Dense(units)
    self.V = Dense(1)

  def call(self, values, query): # 단, key와 value는 같음
    # query shape == (batch_size, hidden size)
    # hidden_with_time_axis shape == (batch_size, 1, hidden size)
    # score 계산을 위해 뒤에서 할 덧셈을 위해서 차원을 변경해줍니다.
    hidden_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [46]:
sequence_input = Input(shape=(max_len,), dtype='int32')
embedded_sequences = Embedding(vocab_size, 128, input_length=max_len, mask_zero = True)(sequence_input)

lstm = Bidirectional(LSTM(64, dropout=0.5, return_sequences = True))(embedded_sequences)
lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional \
  (LSTM(64, dropout=0.5, return_sequences=True, return_state=True))(lstm)

state_h = Concatenate()([forward_h, backward_h]) # 은닉 상태
state_c = Concatenate()([forward_c, backward_c]) # 셀 상태

attention = BahdanauAttention(64) # 가중치 크기 정의
context_vector, attention_weights = attention(lstm, state_h)

dense1 = Dense(20, activation="relu")(context_vector)
dropout = Dropout(0.5)(dense1)
output = Dense(11, activation="softmax")(dropout)
model5 = Model(inputs=sequence_input, outputs=output)

In [47]:
model5.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model5_path = 'best-lstm-attention.h5py'
mc5 = ModelCheckpoint(model5_path, verbose=1, save_best_only=True)
es5 = EarlyStopping(patience=5)

In [None]:
hist = model5.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=100, batch_size=128, callbacks=[mc5, es5]
)

### 트랜스포머

In [321]:
import tensorflow as tf
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [322]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, embedding_dim, num_heads=8):
        super(MultiHeadAttention, self).__init__()
        self.embedding_dim = embedding_dim # d_model
        self.num_heads = num_heads

        assert embedding_dim % self.num_heads == 0

        self.projection_dim = embedding_dim // num_heads
        self.query_dense = tf.keras.layers.Dense(embedding_dim)
        self.key_dense = tf.keras.layers.Dense(embedding_dim)
        self.value_dense = tf.keras.layers.Dense(embedding_dim)
        self.dense = tf.keras.layers.Dense(embedding_dim)

    def scaled_dot_product_attention(self, query, key, value):
        matmul_qk = tf.matmul(query, key, transpose_b=True)
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = matmul_qk / tf.math.sqrt(depth)
        attention_weights = tf.nn.softmax(logits, axis=-1)
        output = tf.matmul(attention_weights, value)
        return output, attention_weights

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]

        # (batch_size, seq_len, embedding_dim)
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)

        # (batch_size, num_heads, seq_len, projection_dim)
        query = self.split_heads(query, batch_size)  
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        scaled_attention, _ = self.scaled_dot_product_attention(query, key, value)
        # (batch_size, seq_len, num_heads, projection_dim)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  

        # (batch_size, seq_len, embedding_dim)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.embedding_dim))
        outputs = self.dense(concat_attention)
        return outputs

In [323]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embedding_dim, num_heads, dff, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(embedding_dim, num_heads)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(dff, activation="relu"),
             tf.keras.layers.Dense(embedding_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs) # 첫번째 서브층 : 멀티 헤드 어텐션
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output) # Add & Norm
        ffn_output = self.ffn(out1) # 두번째 서브층 : 포지션 와이즈 피드 포워드 신경망
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output) # Add & Norm

In [324]:
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, max_len, vocab_size, embedding_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.pos_emb = tf.keras.layers.Embedding(max_len, embedding_dim)

    def call(self, x):
        max_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=max_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [325]:
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_len)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_len)

In [336]:
embedding_dim = 32  # 각 단어의 임베딩 벡터의 차원
num_heads = 2  # 어텐션 헤드의 수
dff = 32  # 포지션 와이즈 피드 포워드 신경망의 은닉층의 크기

inputs = tf.keras.layers.Input(shape=(max_len,))
embedding_layer = TokenAndPositionEmbedding(max_len, vocab_size, embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, dff)
x = transformer_block(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)
outputs = tf.keras.layers.Dense(11, activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [337]:
model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])
model_path = 'best-transforemr-attention.h5py'
mc = ModelCheckpoint(model_path, verbose=1, save_best_only=True)
es = EarlyStopping(patience=3)

In [339]:
history = model.fit(X_train, Y_train, validation_split=0.2,
                    batch_size=128, epochs=100, callbacks=[mc, es])

Epoch 1/100
Epoch 1: val_loss improved from inf to 11.09348, saving model to best-transforemr-attention.h5py




INFO:tensorflow:Assets written to: best-transforemr-attention.h5py/assets


INFO:tensorflow:Assets written to: best-transforemr-attention.h5py/assets


Epoch 2/100
Epoch 2: val_loss did not improve from 11.09348
Epoch 3/100
Epoch 3: val_loss did not improve from 11.09348
Epoch 4/100
Epoch 4: val_loss did not improve from 11.09348


In [340]:
best_model = load_model(model_path)
best_model.evaluate(X_test, Y_test)



[11.011927604675293, 0.0824720486998558]

### BiLSTM

In [61]:
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPooling1D, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [65]:
inputs = Input(shape=(max_len,))
em = Embedding(vocab_size, 128, input_length=max_len)(inputs)

x = Bidirectional(LSTM(128, return_sequences=True))(em)
x = Dropout(0.1)(x)
x = GlobalMaxPooling1D()(x)
outputs = Dense(11, activation='softmax')(x)

model2 = Model(inputs = inputs, outputs = outputs)
model2.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_9 (Embedding)     (None, 100, 128)          4734336   
                                                                 
 bidirectional_9 (Bidirectio  (None, 100, 256)         263168    
 nal)                                                            
                                                                 
 dropout_13 (Dropout)        (None, 100, 256)          0         
                                                                 
 global_max_pooling1d_3 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_37 (Dense)            (None, 11)                2827

In [67]:
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2_path = 'best-bilstm.h5'
mc2 = ModelCheckpoint(model2_path, verbose=1, save_best_only=True)
es2 = EarlyStopping(patience=5)

In [68]:
hist = model2.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=128, callbacks=[mc2, es2]
)

Epoch 1/30
Epoch 1: val_loss improved from inf to 2.51077, saving model to best-bilstm.h5
Epoch 2/30
Epoch 2: val_loss improved from 2.51077 to 1.99253, saving model to best-bilstm.h5
Epoch 3/30
Epoch 3: val_loss improved from 1.99253 to 1.98132, saving model to best-bilstm.h5
Epoch 4/30
Epoch 4: val_loss did not improve from 1.98132
Epoch 5/30
Epoch 5: val_loss did not improve from 1.98132
Epoch 6/30
Epoch 6: val_loss did not improve from 1.98132
Epoch 7/30
Epoch 7: val_loss did not improve from 1.98132
Epoch 8/30
Epoch 8: val_loss did not improve from 1.98132


In [69]:
best_model2 = load_model(model2_path)
best_model2.evaluate(X_test, Y_test)



[1.9720739126205444, 0.5127609968185425]

### BiLSTM + LSTM

In [70]:
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPooling1D, Dropout, Bidirectional, LayerNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [255]:
del model3

In [256]:
inputs = Input(shape=(max_len,))
em = Embedding(vocab_size, 512, input_length=max_len)(inputs)

x = Bidirectional(LSTM(256, return_sequences=True))(em)
x = LayerNormalization(epsilon=1e-6)(em + x)
# x = Conv1D(256, 5, activation='relu')(x)
# x = GlobalMaxPooling1D()(x)
x = LSTM(512)(x)
# x = Dropout(0.1)(x)
outputs = Dense(11, activation='softmax')(x)

model3 = Model(inputs = inputs, outputs = outputs)
model3.summary()

Model: "model_29"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_34 (InputLayer)          [(None, 100)]        0           []                               
                                                                                                  
 embedding_35 (Embedding)       (None, 100, 512)     18937344    ['input_34[0][0]']               
                                                                                                  
 bidirectional_35 (Bidirectiona  (None, 100, 512)    1574912     ['embedding_35[0][0]']           
 l)                                                                                               
                                                                                                  
 tf.__operators__.add_20 (TFOpL  (None, 100, 512)    0           ['embedding_35[0][0]',    

In [257]:
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3_path = 'best-bilstm-lstm.h5'
mc3 = ModelCheckpoint(model3_path, verbose=1, save_best_only=True)
es3 = EarlyStopping(patience=5)

In [258]:
hist = model3.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=50, batch_size=128, callbacks=[mc3, es3]
)

Epoch 1/50
Epoch 1: val_loss improved from inf to 1.50746, saving model to best-bilstm-lstm.h5
Epoch 2/50
Epoch 2: val_loss did not improve from 1.50746
Epoch 3/50
Epoch 3: val_loss did not improve from 1.50746
Epoch 4/50
Epoch 4: val_loss did not improve from 1.50746
Epoch 5/50
Epoch 5: val_loss did not improve from 1.50746
Epoch 6/50
Epoch 6: val_loss did not improve from 1.50746


In [259]:
best_model3 = load_model(model3_path)
best_model3.evaluate(X_test, Y_test)



[1.4835562705993652, 0.7310693860054016]

In [319]:
smile_train.iloc[:, 1:].values

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [320]:
best_model3.evaluate(strain_data, smile_train.iloc[:, 1:].values)



[1.3336364030838013, 0.6649421453475952]

In [267]:
a = best_model3.predict(X_test)

In [310]:
smile_train[smile_train['개인지칭'] != 0]

Unnamed: 0,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,clean,개인지칭
0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
15000,0,0,0,0,0,0,0,0,0,1,0
15001,0,0,0,0,0,1,0,0,0,0,0
15002,1,0,0,1,0,0,0,0,0,0,0
15003,0,0,0,0,0,0,0,0,1,0,0


In [260]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = df.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.")

In [308]:
num=15000
text = smile_train['문장'][num]
print(sentiment_predict(text, best_model3,tokenizer=t, max_len=max_len))
print(smile_train.loc[num])

'께롱께롱께롱'
 87.7479076385498%의 확률로 clean에 대한 악플입니다.
None
문장       께롱께롱께롱!!!
여성/가족            0
남성               0
성소수자             0
인종/국적            0
연령               0
지역               0
종교               0
기타 혐오            0
악플/욕설            0
clean            1
개인지칭             0
Name: 15000, dtype: object


In [274]:
num=9000
text = df['문장'][num]
print(sentiment_predict(text, best_model3,tokenizer=t, max_len=max_len))
print(df.loc[num])

'뭔 저런 벙신이 검사출신이데  등신도 저런 등신은 없었네  역대급 등신 출몰이네  아니면 기레기들 소음 까지도 기사로 취급하는 놈 까지도    이지'
 47.057145833969116%의 확률로 기타혐오에 대한 악플입니다.
None
문장       뭔 저런 벙신이 검사출신이데  등신도 저런 등신은 없었네  역대급 등신 출몰이네  ...
여성/가족                                                    0
남성                                                       0
성소수자                                                     0
인종/국적                                                    0
연령                                                       0
지역                                                       0
종교                                                       0
기타혐오                                                     1
악플/욕설                                                    1
clean                                                    0
분쟁유발                                                   0.0
Name: 9000, dtype: object


### CNN + BiLSTM + LSTM

In [178]:
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPooling1D, Dropout, Bidirectional, LayerNormalization, Conv1D, Reshape
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [208]:
del model4

In [209]:
inputs = Input(shape=(max_len,))
em = Embedding(vocab_size, 256, input_length=max_len)(inputs)

x = Conv1D(256, 5, activation='relu')(em)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = LSTM(128)(x)
# x = GlobalMaxPooling1D()(x)
x = Dropout(0.5)(x)
outputs = Dense(11, activation='softmax')(x)

model4 = Model(inputs = inputs, outputs = outputs)
model4.summary()

Model: "model_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_22 (InputLayer)       [(None, 100)]             0         
                                                                 
 embedding_23 (Embedding)    (None, 100, 256)          9468672   
                                                                 
 conv1d_4 (Conv1D)           (None, 96, 256)           327936    
                                                                 
 bidirectional_23 (Bidirecti  (None, 96, 256)          394240    
 onal)                                                           
                                                                 
 lstm_37 (LSTM)              (None, 128)               197120    
                                                                 
 dropout_30 (Dropout)        (None, 128)               0         
                                                          

In [210]:
model4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model4_path = 'best-cnn-bilstm-lstm.h5'
mc4 = ModelCheckpoint(model4_path, verbose=1, save_best_only=True)
es4 = EarlyStopping(patience=5)

In [211]:
hist = model4.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=128, callbacks=[mc4, es4]
)

Epoch 1/30
Epoch 1: val_loss improved from inf to 1.95072, saving model to best-cnn-bilstm-lstm.h5
Epoch 2/30
Epoch 2: val_loss improved from 1.95072 to 1.63866, saving model to best-cnn-bilstm-lstm.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 1.63866
Epoch 4/30
Epoch 4: val_loss did not improve from 1.63866
Epoch 5/30
Epoch 5: val_loss did not improve from 1.63866
Epoch 6/30
Epoch 6: val_loss did not improve from 1.63866
Epoch 7/30
Epoch 7: val_loss did not improve from 1.63866


In [212]:
best_model4 = load_model(model4_path)
best_model4.evaluate(X_test, Y_test)



[1.6609307527542114, 0.6169584393501282]