# 악플 분류기 - 다중분류

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
smile_train = pd.read_csv("../data/multi label data/common_data.csv")

In [4]:
smile_train.drop(['교정문장', '띄어쓰기수정문장', '분쟁유발'], axis='columns', inplace=True)
smile_train

Unnamed: 0,문장,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타혐오,악플/욕설,clean
0,문재앙은 김정은의 운전사도 안되는 똥휴지다 개돼지들이 뽑은 재앙민국이 망해간다,0,0,0,0,0,0,0,0,1,0
1,근데 사람들이 우스갯소리로 일 재앙이라고 했는데 이제보니 장난아니고 일 재앙...,0,0,0,0,0,0,0,0,0,0
2,공무뭔들도 정권 바뀐다는걸 아는구나 이놈들은 원래 연줄이 밥줄이라 눈치는 백단이지 ...,0,0,0,0,0,0,0,0,1,0
3,조옷도 모르는게 가만히 쳐있어 그냥 주둥이 나불대지말구 이 좌빨 니들 얘기하는...,0,0,0,0,0,0,0,0,0,0
4,토착빨갱이같으니라고,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
47404,께롱께롱께롱,0,0,0,0,0,0,0,0,0,1
47405,갱상도가아니라 홍어지 개좃같은 홍어년들,0,0,0,0,0,1,0,0,0,0
47406,말레이시아랑 인도네시아 여자 존나 못생겼던데,1,0,0,1,0,0,0,0,0,0
47407,링크도 안박고 가서 글 쓰자고 선동하네,0,0,0,0,0,0,0,0,1,0


In [5]:
from konlpy.tag import Mecab
from tqdm import tqdm
mecab = Mecab()

In [6]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을','ㅋㅋ','ㅠㅠ','ㅎㅎ']

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
t = Tokenizer()

## 데이터 전처리

In [8]:
# 중복 데이터 확인
print(smile_train.shape, smile_train.문장.nunique())

(47409, 11) 47409


In [9]:
# 분류가 안되어 있는 데이터 확인
print(smile_train[smile_train.sum(axis=1) == 0].index)

# 분류 안되어 있는 데이터 삭제
smile_train = smile_train[smile_train.sum(axis=1) != 0]

Int64Index([    1,     3,     4,     5,     6,     7,     8,     9,    12,
               13,
            ...
             4665,  4666,  4668,  4669,  4670,  4671,  4672,  4673, 38299,
            44356],
           dtype='int64', length=3497)


## 텍스트 전처리

In [10]:
# 한글 이외의 문자는 공백으로 처리하고 strip
smile_train.문장 = smile_train.문장.str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ').str.strip()
smile_train.문장.replace('', np.nan, inplace=True)
print(smile_train.문장.isna().sum())
smile_train.dropna(how='any', inplace=True)
smile_train.reset_index(drop=True, inplace=True)
smile_train.shape

0


(43912, 11)

## 한글 형태소 분석

In [11]:
from konlpy.tag import Mecab
from tqdm.notebook import tqdm

In [12]:
mecab = Mecab()
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을','ㅋㅋ','ㅠㅠ','ㅎㅎ']

In [13]:
train_data = []
for sentence in tqdm(smile_train.문장):
  morphs = mecab.morphs(sentence)
  tmp_X = [word for word in morphs if word not in stopwords]
  train_data.append(tmp_X)

  0%|          | 0/43912 [00:00<?, ?it/s]

## 토큰화

In [14]:
import numpy as np
import tensorflow as tf
seed = 2022
np.random.seed(seed)
tf.random.set_seed(seed)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(train_data, smile_train.iloc[:, 1:], test_size=0.2)

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(X_train)

In [18]:
# 등장 빈도가 3 미만인 것의 갯수
threshold = 3
total_cnt = len(t.word_index)   # 단어의 수
rare_cnt = 0                    # 등장 빈도가 threshold 보다 작은 단어의 갯수
total_freq = 0                  # 훈련 데이터의 전체 단어의 빈도수의 합
rare_freq = 0                   # 등장 빈도가 threshold 보다 작은 단어의 등장 빈도수의 합

In [19]:
for key, value in t.word_counts.items():
  total_freq += value
  if value < threshold:
    rare_cnt += 1
    rare_freq += value

In [20]:
print('단어 집합(vocabulary)의 크기 :', total_cnt)
print(f'등장 빈도가 {threshold - 1}번 이하인 희귀 단어의 수: {rare_cnt}')
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

단어 집합(vocabulary)의 크기 : 35570
등장 빈도가 2번 이하인 희귀 단어의 수: 20492
단어 집합에서 희귀 단어의 비율: 57.61034579701996
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 3.0437820936317173


In [21]:
# 모든 단어 사용
vocab_size = total_cnt + 2
vocab_size

35572

In [22]:
t = Tokenizer(num_words=vocab_size, oov_token='OOV')
t.fit_on_texts(train_data)
X_train = t.texts_to_sequences(X_train)
X_test = t.texts_to_sequences(X_test)

In [23]:
import pickle

with open('tokenizer.pkl','wb') as f:
  pickle.dump(t,f)

In [24]:
# 데이터의 최대/평균 길이
max(len(s) for s in X_train), sum(map(len, X_train)) / len(X_train)

(164, 23.87093284750491)

In [25]:
# 악플 길이를 70으로 설정
max_len = 70

In [26]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

X_train.shape, X_test.shape

((35129, 70), (8783, 70))

In [27]:
Y_train = Y_train.values
Y_test = Y_test.values

In [28]:
Y_train.shape, Y_test.shape

((35129, 10), (8783, 10))

## 모델 정의/설정/학습

### BiLSTM

In [29]:
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPooling1D, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [30]:
inputs = Input(shape=(max_len,))
em = Embedding(vocab_size, 128, input_length=max_len)(inputs)

x = Bidirectional(LSTM(128, return_sequences=True))(em)
x = Dropout(0.1)(x)
x = GlobalMaxPooling1D()(x)
outputs = Dense(10, activation='softmax')(x)

model2 = Model(inputs = inputs, outputs = outputs)
model2.summary()

2022-06-16 09:26:18.671554: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-16 09:26:18.744323: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-16 09:26:18.744458: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-16 09:26:18.745302: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 70)]              0         
                                                                 
 embedding (Embedding)       (None, 70, 128)           4553216   
                                                                 
 bidirectional (Bidirectiona  (None, 70, 256)          263168    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 70, 256)           0         
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                2570  

In [31]:
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2_path = 'best-bilstm.h5'
mc2 = ModelCheckpoint(model2_path, verbose=1, save_best_only=True)
es2 = EarlyStopping(patience=5)

In [32]:
hist = model2.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=128, callbacks=[mc2, es2]
)

Epoch 1/30


2022-06-16 09:26:26.012668: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8101


  1/220 [..............................] - ETA: 12:29 - loss: 2.7455 - accuracy: 0.1484

2022-06-16 09:26:27.206047: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 00001: val_loss improved from inf to 1.93467, saving model to best-bilstm.h5
Epoch 2/30
Epoch 00002: val_loss improved from 1.93467 to 1.55496, saving model to best-bilstm.h5
Epoch 3/30
Epoch 00003: val_loss improved from 1.55496 to 1.52554, saving model to best-bilstm.h5
Epoch 4/30
Epoch 00004: val_loss improved from 1.52554 to 1.47728, saving model to best-bilstm.h5
Epoch 5/30
Epoch 00005: val_loss did not improve from 1.47728
Epoch 6/30
Epoch 00006: val_loss did not improve from 1.47728
Epoch 7/30
Epoch 00007: val_loss did not improve from 1.47728
Epoch 8/30
Epoch 00008: val_loss did not improve from 1.47728
Epoch 9/30
Epoch 00009: val_loss did not improve from 1.47728


In [33]:
best_model2 = load_model(model2_path)
best_model2.evaluate(X_test, Y_test)



[1.4603980779647827, 0.6612774729728699]

In [34]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [35]:
print(sentiment_predict('종북좌파 빨갱이새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model2,t, max_len=max_len))

'종북좌파 빨갱이새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 38.78412842750549%의 확률로 악플/욕설에 대한 악플입니다.
[5.3943645e-02 9.8933339e-02 2.9625958e-02 7.0890278e-02 1.1872431e-01
 5.2823655e-02 6.8609685e-02 1.1859394e-01 3.8784128e-01 1.3939148e-05]
None


### BiLSTM + LSTM

In [36]:
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPooling1D, Dropout, Bidirectional, LayerNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [37]:
inputs = Input(shape=(max_len,))
em = Embedding(vocab_size, 512, input_length=max_len)(inputs)

x = Bidirectional(LSTM(256, return_sequences=True))(em)
x = LayerNormalization(epsilon=1e-6)(em + x)
# x = Conv1D(256, 5, activation='relu')(x)
# x = GlobalMaxPooling1D()(x)
x = LSTM(512)(x)
# x = Dropout(0.1)(x)
outputs = Dense(10, activation='softmax')(x)

model3 = Model(inputs = inputs, outputs = outputs)
model3.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 70)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 70, 512)      18212864    ['input_2[0][0]']                
                                                                                                  
 bidirectional_1 (Bidirectional  (None, 70, 512)     1574912     ['embedding_1[0][0]']            
 )                                                                                                
                                                                                                  
 tf.__operators__.add (TFOpLamb  (None, 70, 512)     0           ['embedding_1[0][0]',      

In [38]:
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3_path = 'best-bilstm-lstm.h5'
mc3 = ModelCheckpoint(model3_path, verbose=1, save_best_only=True)
es3 = EarlyStopping(patience=5)
X_train = X_train.astype(float)

In [39]:
hist = model3.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=50, batch_size=128, callbacks=[mc3, es3]
)

Epoch 1/50
Epoch 00001: val_loss improved from inf to 1.17998, saving model to best-bilstm-lstm.h5
Epoch 2/50
Epoch 00002: val_loss did not improve from 1.17998
Epoch 3/50
Epoch 00003: val_loss did not improve from 1.17998
Epoch 4/50
Epoch 00004: val_loss did not improve from 1.17998
Epoch 5/50
Epoch 00005: val_loss did not improve from 1.17998
Epoch 6/50
Epoch 00006: val_loss did not improve from 1.17998


In [40]:
best_model3 = load_model(model3_path)
best_model3.evaluate(X_test, Y_test)



[1.170005440711975, 0.7728566527366638]

In [48]:
smile_train.iloc[:, 1:].values

array([[0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
a = best_model3.predict(X_test)

In [42]:
print(X_test[0],a[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0 3660  299 1708    7   45 1745
  142 1892   49   37 1618 3072 4991  183    8   22   15   26  131    2
   43 1204 1713  145    7    6    5  357   30   22   19 2705 3295    5] [1.8566119e-02 8.6000663e-01 1.2465653e-02 8.5147601e-03 7.9079801e-03
 8.3633262e-04 1.7706309e-03 1.8368376e-02 4.7879521e-02 2.3684096e-02]


In [43]:
smile_train[smile_train['개인지칭'] != 0]

KeyError: '개인지칭'

In [44]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [45]:
print(sentiment_predict('짱꺠 씨발놈들', best_model3,tokenizer=t, max_len=max_len))

'짱꺠 씨발놈들'
 85.09088158607483%의 확률로 인종/국적에 대한 악플입니다.
[1.2451771e-03 1.5109007e-03 1.0151444e-04 8.5090882e-01 5.2194246e-03
 1.0189397e-02 1.9893625e-03 3.2475218e-03 1.2555911e-01 2.8718128e-05]
None


### CNN + BiLSTM + LSTM

In [46]:
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPooling1D, Dropout, Bidirectional, LayerNormalization, Conv1D, Reshape
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [49]:
inputs = Input(shape=(max_len,))
em = Embedding(vocab_size, 256, input_length=max_len)(inputs)

x = Conv1D(256, 5, activation='relu')(em)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = LSTM(128)(x)
# x = GlobalMaxPooling1D()(x)
x = Dropout(0.5)(x)
outputs = Dense(10, activation='softmax')(x)

model4 = Model(inputs = inputs, outputs = outputs)
model4.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 70)]              0         
                                                                 
 embedding_3 (Embedding)     (None, 70, 256)           9106432   
                                                                 
 conv1d_1 (Conv1D)           (None, 66, 256)           327936    
                                                                 
 bidirectional_3 (Bidirectio  (None, 66, 256)          394240    
 nal)                                                            
                                                                 
 lstm_6 (LSTM)               (None, 128)               197120    
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                           

In [50]:
model4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model4_path = 'best-cnn-bilstm-lstm.h5'
mc4 = ModelCheckpoint(model4_path, verbose=1, save_best_only=True)
es4 = EarlyStopping(patience=5)

In [51]:
hist = model4.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=128, callbacks=[mc4, es4]
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 1.48260, saving model to best-cnn-bilstm-lstm.h5
Epoch 2/30
Epoch 00002: val_loss improved from 1.48260 to 1.35963, saving model to best-cnn-bilstm-lstm.h5
Epoch 3/30
Epoch 00003: val_loss did not improve from 1.35963
Epoch 4/30
Epoch 00004: val_loss did not improve from 1.35963
Epoch 5/30
Epoch 00005: val_loss did not improve from 1.35963
Epoch 6/30
Epoch 00006: val_loss did not improve from 1.35963
Epoch 7/30
Epoch 00007: val_loss did not improve from 1.35963


In [52]:
best_model4 = load_model(model4_path)
best_model4.evaluate(X_test, Y_test)



[1.3598785400390625, 0.7126266360282898]

In [53]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [54]:
print(sentiment_predict('종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model4,tokenizer=t, max_len=max_len))

'종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 76.64451003074646%의 확률로 악플/욕설에 대한 악플입니다.
[1.9820195e-02 1.0527811e-02 2.8749626e-02 5.3500343e-02 6.2547727e-03
 1.0713669e-02 5.3658128e-02 5.0156832e-02 7.6644510e-01 1.7356283e-04]
None


### LSTM

In [55]:
from tensorflow.keras.layers import Embedding, Dense, LSTM, Conv1D
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [57]:
model10 = Sequential()
model10.add(Embedding(vocab_size,256))
model10.add(LSTM(64, return_sequences=True))
model10.add(LSTM(32))
model10.add(Dense(10,activation='softmax'))

model10.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, None, 256)         9106432   
                                                                 
 lstm_9 (LSTM)               (None, None, 64)          82176     
                                                                 
 lstm_10 (LSTM)              (None, 32)                12416     
                                                                 
 dense_5 (Dense)             (None, 10)                330       
                                                                 
Total params: 9,201,354
Trainable params: 9,201,354
Non-trainable params: 0
_________________________________________________________________


In [58]:
model10.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model10_path = 'LSTM.h5'
mc10 = ModelCheckpoint(model10_path, verbose=1, save_best_only=True)
es10 = EarlyStopping(patience=5)

In [59]:
hist10 = model10.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=128, callbacks=[mc10, es10]
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 2.03651, saving model to LSTM.h5
Epoch 2/30
Epoch 00002: val_loss improved from 2.03651 to 1.63259, saving model to LSTM.h5
Epoch 3/30
Epoch 00003: val_loss improved from 1.63259 to 1.48826, saving model to LSTM.h5
Epoch 4/30
Epoch 00004: val_loss improved from 1.48826 to 1.46816, saving model to LSTM.h5
Epoch 5/30
Epoch 00005: val_loss did not improve from 1.46816
Epoch 6/30
Epoch 00006: val_loss did not improve from 1.46816
Epoch 7/30
Epoch 00007: val_loss did not improve from 1.46816
Epoch 8/30
Epoch 00008: val_loss did not improve from 1.46816
Epoch 9/30
Epoch 00009: val_loss did not improve from 1.46816


In [60]:
best_model10 = load_model(model10_path)
best_model10.evaluate(X_test, Y_test)



[1.4706467390060425, 0.6987361907958984]

In [112]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [61]:
print(sentiment_predict('종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model10,tokenizer=t, max_len=max_len))

'종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 81.13899230957031%의 확률로 악플/욕설에 대한 악플입니다.
[1.1817344e-02 1.1788056e-02 2.0742339e-04 1.0794081e-02 7.7460550e-02
 1.2709951e-02 8.5514765e-03 5.5103309e-02 8.1138992e-01 1.7784587e-04]
None


### Conv1D + LSTM

In [62]:
from tensorflow.keras.layers import MaxPooling1D

model6 = Sequential()
model6.add(Embedding(vocab_size, 512, input_length=max_len))
model6.add(Conv1D(32, 3, activation = 'relu'))
model6.add(MaxPooling1D(2))
model6.add(Conv1D(32, 3, activation = 'relu'))
model6.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model6.add(Dense(10,activation='softmax'))

model6.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 70, 512)           18212864  
                                                                 
 conv1d_2 (Conv1D)           (None, 68, 32)            49184     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 34, 32)           0         
 )                                                               
                                                                 
 conv1d_3 (Conv1D)           (None, 32, 32)            3104      
                                                                 
 lstm_11 (LSTM)              (None, 32)                8320      
                                                                 
 dense_6 (Dense)             (None, 10)                330       
                                                      

In [63]:
model6.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model6_path = 'Conv1D_LSTM.h5'
mc6 = ModelCheckpoint(model6_path, verbose=1, save_best_only=True)
es6 = EarlyStopping(patience=5)

In [64]:
hist6 = model6.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=128, callbacks=[mc6, es6]
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 1.46950, saving model to Conv1D_LSTM.h5
Epoch 2/30
Epoch 00002: val_loss improved from 1.46950 to 1.30272, saving model to Conv1D_LSTM.h5
Epoch 3/30
Epoch 00003: val_loss did not improve from 1.30272
Epoch 4/30
Epoch 00004: val_loss did not improve from 1.30272
Epoch 5/30
Epoch 00005: val_loss did not improve from 1.30272
Epoch 6/30
Epoch 00006: val_loss did not improve from 1.30272
Epoch 7/30
Epoch 00007: val_loss did not improve from 1.30272


In [65]:
best_model6 = load_model(model6_path)
best_model6.evaluate(X_test, Y_test)



[1.3047595024108887, 0.7454172968864441]

In [126]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [66]:
print(sentiment_predict('종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model6,tokenizer=t, max_len=max_len))

'종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 77.11883783340454%의 확률로 악플/욕설에 대한 악플입니다.
[0.01707654 0.00433943 0.00714698 0.01873564 0.00339313 0.05533494
 0.00262584 0.118674   0.7711884  0.00148509]
None


### Conv1D + BiLSTM

In [67]:
from tensorflow.keras.layers import MaxPooling1D

model7 = Sequential()
model7.add(Embedding(vocab_size, 512, input_length=max_len))
model7.add(Conv1D(32, 3, activation = 'relu'))
model7.add(MaxPooling1D(2))
model7.add(Conv1D(32, 3, activation = 'relu'))
model7.add(Bidirectional(LSTM(32, return_sequences=True)))
model7.add(GlobalMaxPooling1D())
model7.add(Dense(10,activation='softmax'))

model7.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 70, 512)           18212864  
                                                                 
 conv1d_4 (Conv1D)           (None, 68, 32)            49184     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 34, 32)           0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (None, 32, 32)            3104      
                                                                 
 bidirectional_4 (Bidirectio  (None, 32, 64)           16640     
 nal)                                                            
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)              

In [68]:
model7.compile('adam', 'categorical_crossentropy', ['accuracy'])
model7_path = 'Conv1D_BiLSTM.h5'
mc7 = ModelCheckpoint(model7_path, verbose=1, save_best_only=True)
es7 = EarlyStopping(patience=5)

In [69]:
hist7 = model7.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=100, batch_size=64, callbacks=[mc7, es7]
)

Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.44538, saving model to Conv1D_BiLSTM.h5
Epoch 2/100
Epoch 00002: val_loss improved from 1.44538 to 1.40377, saving model to Conv1D_BiLSTM.h5
Epoch 3/100
Epoch 00003: val_loss did not improve from 1.40377
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.40377
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.40377
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.40377
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.40377


In [70]:
best_model7 = load_model(model7_path)
best_model7.evaluate(X_test, Y_test)



[1.3981177806854248, 0.7114881277084351]

In [135]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    epochs=30, batch_size=32, callbacks=[mc8, es8]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [71]:
print(sentiment_predict('종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model7,tokenizer=t, max_len=max_len))

'종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 69.932621717453%의 확률로 악플/욕설에 대한 악플입니다.
[9.0369083e-02 2.5155197e-03 1.2356249e-02 1.2679873e-02 1.5516406e-01
 6.4723648e-04 4.9875593e-03 2.1878414e-02 6.9932622e-01 7.5757089e-05]
None


### GRU

In [72]:
from tensorflow.keras.layers import GRU

In [73]:
inputs = Input(shape=(max_len,))

model8 = Sequential()

model8.add(Embedding(vocab_size, 512, input_length=max_len))
model8.add(GRU(128))
model8.add(Dense(10, activation='softmax'))

model8.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 70, 512)           18212864  
                                                                 
 gru (GRU)                   (None, 128)               246528    
                                                                 
 dense_8 (Dense)             (None, 10)                1290      
                                                                 
Total params: 18,460,682
Trainable params: 18,460,682
Non-trainable params: 0
_________________________________________________________________


In [74]:
model8.compile('adam', 'categorical_crossentropy', ['accuracy'])
model8_path = 'GRU.h5'
mc8 = ModelCheckpoint(model8_path, verbose=1, save_best_only=True)
es8 = EarlyStopping(patience=5)

In [75]:
hist8 = model8.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=32, callbacks=[mc8, es8]
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 1.15553, saving model to GRU.h5
Epoch 2/30
Epoch 00002: val_loss did not improve from 1.15553
Epoch 3/30
Epoch 00003: val_loss did not improve from 1.15553
Epoch 4/30
Epoch 00004: val_loss did not improve from 1.15553
Epoch 5/30
Epoch 00005: val_loss did not improve from 1.15553
Epoch 6/30
Epoch 00006: val_loss did not improve from 1.15553


In [76]:
best_model8 = load_model(model8_path)
best_model8.evaluate(X_test, Y_test)



[1.1497737169265747, 0.7634065747261047]

In [44]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [77]:
print(sentiment_predict('종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model8,tokenizer=t, max_len=max_len))

'종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 72.1695601940155%의 확률로 악플/욕설에 대한 악플입니다.
[6.9866166e-02 3.5645917e-02 1.0984234e-02 3.2379553e-02 3.2086052e-02
 1.0130853e-02 1.5756112e-02 7.0944749e-02 7.2169560e-01 5.1076728e-04]
None


In [78]:
inputs = Input(shape=(max_len,))

model9 = Sequential()

model9.add(Embedding(vocab_size, 512, input_length=max_len))
model9.add(Conv1D(32, 5, activation = 'relu'))
model9.add(GRU(32))
model9.add(Dense(10, activation='softmax'))

model9.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 70, 512)           18212864  
                                                                 
 conv1d_6 (Conv1D)           (None, 66, 32)            81952     
                                                                 
 gru_1 (GRU)                 (None, 32)                6336      
                                                                 
 dense_9 (Dense)             (None, 10)                330       
                                                                 
Total params: 18,301,482
Trainable params: 18,301,482
Non-trainable params: 0
_________________________________________________________________


In [79]:
model9.compile('adam', 'categorical_crossentropy', ['accuracy'])
model9_path = 'Conv1D_GRU.h5'
mc9 = ModelCheckpoint(model9_path, verbose=1, save_best_only=True)
es9 = EarlyStopping(patience=5)

In [80]:
hist9 = model9.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=100, batch_size=64, callbacks=[mc9, es9]
)

Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.21036, saving model to Conv1D_GRU.h5
Epoch 2/100
Epoch 00002: val_loss improved from 1.21036 to 1.19940, saving model to Conv1D_GRU.h5
Epoch 3/100
Epoch 00003: val_loss did not improve from 1.19940
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.19940
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.19940
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.19940
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.19940


In [81]:
best_model9 = load_model(model9_path)
loss9, acc9 = best_model9.evaluate(X_test, Y_test)
loss9, acc9



(1.1854721307754517, 0.7671638131141663)

In [82]:
print(sentiment_predict('안녕 개새끼들아', best_model9,tokenizer=t, max_len=max_len))

'안녕 개새끼들아'
 59.7104549407959%의 확률로 악플/욕설에 대한 악플입니다.
[0.01004287 0.01034687 0.00304204 0.0219156  0.02924209 0.06849256
 0.00078374 0.05692672 0.59710455 0.20210299]
None
