# 악플 분류기 - 다중분류

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
smile_train = pd.read_csv("../data/multi label data/common_data.csv")

In [5]:
smile_train.drop(['교정문장', '띄어쓰기수정문장'], axis='columns', inplace=True)
smile_train

Unnamed: 0,문장,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타혐오,악플/욕설,clean,분쟁유발
0,문재앙은 김정은의 운전사도 안되는 똥휴지다 개돼지들이 뽑은 재앙민국이 망해간다,0,0,0,0,0,0,0,0,1,0,1.0
1,근데 사람들이 우스갯소리로 일 재앙이라고 했는데 이제보니 장난아니고 일 재앙...,0,0,0,0,0,0,0,0,0,0,1.0
2,공무뭔들도 정권 바뀐다는걸 아는구나 이놈들은 원래 연줄이 밥줄이라 눈치는 백단이지 ...,0,0,0,0,0,0,0,0,1,0,1.0
3,조옷도 모르는게 가만히 쳐있어 그냥 주둥이 나불대지말구 이 좌빨 니들 얘기하는...,0,0,0,0,0,0,0,0,0,0,1.0
4,토착빨갱이같으니라고,0,0,0,0,0,0,0,0,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
47404,께롱께롱께롱,0,0,0,0,0,0,0,0,0,1,0.0
47405,갱상도가아니라 홍어지 개좃같은 홍어년들,0,0,0,0,0,1,0,0,0,0,0.0
47406,말레이시아랑 인도네시아 여자 존나 못생겼던데,1,0,0,1,0,0,0,0,0,0,0.0
47407,링크도 안박고 가서 글 쓰자고 선동하네,0,0,0,0,0,0,0,0,1,0,0.0


In [6]:
from konlpy.tag import Mecab
from tqdm import tqdm
mecab = Mecab()

In [6]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을','ㅋㅋ','ㅠㅠ','ㅎㅎ']

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
t = Tokenizer()

## 데이터 전처리

In [7]:
# 중복 데이터 확인
print(smile_train.shape, smile_train.문장.nunique())

(47409, 12) 47409


In [8]:
# 분류가 안되어 있는 데이터 확인
print(smile_train[smile_train.sum(axis=1) == 0].index)

# 분류 안되어 있는 데이터 삭제
smile_train = smile_train[smile_train.sum(axis=1) != 0]

Int64Index([38299, 44356], dtype='int64')


## 텍스트 전처리

In [9]:
# 한글 이외의 문자는 공백으로 처리하고 strip
smile_train.문장 = smile_train.문장.str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ').str.strip()
smile_train.문장.replace('', np.nan, inplace=True)
print(smile_train.문장.isna().sum())
smile_train.dropna(how='any', inplace=True)
smile_train.reset_index(drop=True, inplace=True)
smile_train.shape

0


(47407, 12)

## 한글 형태소 분석

In [11]:
from konlpy.tag import Mecab
from tqdm.notebook import tqdm

In [12]:
mecab = Mecab()
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을','ㅋㅋ','ㅠㅠ','ㅎㅎ']

In [13]:
train_data = []
for sentence in tqdm(smile_train.문장):
  morphs = mecab.morphs(sentence)
  tmp_X = [word for word in morphs if word not in stopwords]
  train_data.append(tmp_X)

  0%|          | 0/47407 [00:00<?, ?it/s]

## 토큰화

In [14]:
import numpy as np
import tensorflow as tf
seed = 2022
np.random.seed(seed)
tf.random.set_seed(seed)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(train_data, smile_train.iloc[:, 1:], test_size=0.2)

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(X_train)

In [18]:
# 등장 빈도가 3 미만인 것의 갯수
threshold = 3
total_cnt = len(t.word_index)   # 단어의 수
rare_cnt = 0                    # 등장 빈도가 threshold 보다 작은 단어의 갯수
total_freq = 0                  # 훈련 데이터의 전체 단어의 빈도수의 합
rare_freq = 0                   # 등장 빈도가 threshold 보다 작은 단어의 등장 빈도수의 합

In [19]:
for key, value in t.word_counts.items():
  total_freq += value
  if value < threshold:
    rare_cnt += 1
    rare_freq += value

In [20]:
print('단어 집합(vocabulary)의 크기 :', total_cnt)
print(f'등장 빈도가 {threshold - 1}번 이하인 희귀 단어의 수: {rare_cnt}')
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

단어 집합(vocabulary)의 크기 : 36985
등장 빈도가 2번 이하인 희귀 단어의 수: 21328
단어 집합에서 희귀 단어의 비율: 57.66662160335271
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 2.9262103314779724


In [21]:
# 모든 단어 사용
vocab_size = total_cnt + 2
vocab_size

36987

In [22]:
t = Tokenizer(num_words=vocab_size, oov_token='OOV')
t.fit_on_texts(train_data)
X_train = t.texts_to_sequences(X_train)
X_test = t.texts_to_sequences(X_test)

In [23]:
import pickle

with open('tokenizer.pkl','wb') as f:
  pickle.dump(t,f)

In [24]:
# 데이터의 최대/평균 길이
max(len(s) for s in X_train), sum(map(len, X_train)) / len(X_train)

(164, 23.960922874093605)

In [25]:
# 악플 길이를 70으로 설정
max_len = 70

In [26]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

X_train.shape, X_test.shape

((37925, 70), (9482, 70))

In [27]:
Y_train = Y_train.values
Y_test = Y_test.values

In [28]:
Y_train.shape, Y_test.shape

((37925, 11), (9482, 11))

## 모델 정의/설정/학습

### BiLSTM

In [31]:
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPooling1D, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [67]:
inputs = Input(shape=(max_len,))
em = Embedding(vocab_size, 128, input_length=max_len)(inputs)

x = Bidirectional(LSTM(128, return_sequences=True))(em)
x = Dropout(0.1)(x)
x = GlobalMaxPooling1D()(x)
outputs = Dense(11, activation='softmax')(x)

model2 = Model(inputs = inputs, outputs = outputs)
model2.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 70)]              0         
                                                                 
 embedding_4 (Embedding)     (None, 70, 128)           4734336   
                                                                 
 bidirectional_2 (Bidirectio  (None, 70, 256)          263168    
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 70, 256)           0         
                                                                 
 global_max_pooling1d_1 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_10 (Dense)            (None, 11)                2827

In [68]:
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2_path = 'best-bilstm.h5'
mc2 = ModelCheckpoint(model2_path, verbose=1, save_best_only=True)
es2 = EarlyStopping(patience=5)

In [69]:
hist = model2.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=128, callbacks=[mc2, es2]
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 2.33801, saving model to best-bilstm.h5
Epoch 2/30
Epoch 00002: val_loss improved from 2.33801 to 2.08688, saving model to best-bilstm.h5
Epoch 3/30
Epoch 00003: val_loss improved from 2.08688 to 1.95171, saving model to best-bilstm.h5
Epoch 4/30
Epoch 00004: val_loss improved from 1.95171 to 1.90170, saving model to best-bilstm.h5
Epoch 5/30
Epoch 00005: val_loss did not improve from 1.90170
Epoch 6/30
Epoch 00006: val_loss did not improve from 1.90170
Epoch 7/30
Epoch 00007: val_loss did not improve from 1.90170
Epoch 8/30
Epoch 00008: val_loss did not improve from 1.90170
Epoch 9/30
Epoch 00009: val_loss did not improve from 1.90170


In [70]:
best_model2 = load_model(model2_path)
best_model2.evaluate(X_test, Y_test)



[1.9146223068237305, 0.5499894618988037]

In [32]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [79]:
print(sentiment_predict('종북좌파 빨갱이새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model2,t, max_len=max_len))

'종북좌파 빨갱이새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 59.67366695404053%의 확률로 분쟁유발에 대한 악플입니다.
[1.0289536e-02 4.0037604e-03 2.7747632e-03 1.2924887e-02 8.9323103e-02
 4.1950587e-02 1.3393288e-02 1.3148430e-01 9.7117752e-02 1.3318263e-06
 5.9673667e-01]
None


### BiLSTM + LSTM

In [33]:
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPooling1D, Dropout, Bidirectional, LayerNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [41]:
inputs = Input(shape=(max_len,))
em = Embedding(vocab_size, 512, input_length=max_len)(inputs)

x = Bidirectional(LSTM(256, return_sequences=True))(em)
x = LayerNormalization(epsilon=1e-6)(em + x)
# x = Conv1D(256, 5, activation='relu')(x)
# x = GlobalMaxPooling1D()(x)
x = LSTM(512)(x)
# x = Dropout(0.1)(x)
outputs = Dense(11, activation='softmax')(x)

model3 = Model(inputs = inputs, outputs = outputs)
model3.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 70)]         0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 70, 512)      18937344    ['input_3[0][0]']                
                                                                                                  
 bidirectional_1 (Bidirectional  (None, 70, 512)     1574912     ['embedding_3[0][0]']            
 )                                                                                                
                                                                                                  
 tf.__operators__.add (TFOpLamb  (None, 70, 512)     0           ['embedding_3[0][0]',      

In [42]:
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model3_path = 'best-bilstm-lstm.h5'
mc3 = ModelCheckpoint(model3_path, verbose=1, save_best_only=True)
es3 = EarlyStopping(patience=5)
X_train = X_train.astype(float)

In [46]:
hist = model3.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=50, batch_size=128, callbacks=[mc3, es3]

Epoch 1/50
Epoch 00001: val_loss did not improve from 1.48607
Epoch 2/50
Epoch 00002: val_loss did not improve from 1.48607
Epoch 3/50
Epoch 00003: val_loss did not improve from 1.48607
Epoch 4/50
Epoch 00004: val_loss did not improve from 1.48607
Epoch 5/50
Epoch 00005: val_loss did not improve from 1.48607
Epoch 6/50
Epoch 00006: val_loss did not improve from 1.48607


In [47]:
best_model3 = load_model(model3_path)
best_model3.evaluate(X_test, Y_test)



[1.4627939462661743, 0.7281164526939392]

In [48]:
smile_train.iloc[:, 1:].values

array([[0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [50]:
a = best_model3.predict(X_test)

In [51]:
print(X_test[0],a[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0  430 1788  543  954] [6.5020239e-03 5.3583640e-03 8.5508255e-03 6.1273631e-03 1.4967654e-03
 6.5123998e-03 1.7228330e-02 2.2037890e-02 1.9144014e-01 7.3440826e-01
 3.3765769e-04]


In [63]:
smile_train[smile_train['개인지칭'] != 0]

Unnamed: 0,문장,여성/가족,남성,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,clean,개인지칭


In [58]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [62]:
print(sentiment_predict('짱꺠 씨발놈들', best_model3,tokenizer=t, max_len=max_len))

'짱꺠 씨발놈들'
 93.59042644500732%의 확률로 인종/국적에 대한 악플입니다.
[1.7940421e-03 7.3899966e-05 3.7657669e-06 9.3590426e-01 1.9234995e-04
 1.3980011e-03 2.4783649e-03 2.1128661e-03 5.4138895e-02 3.7260385e-08
 1.9035075e-03]
None


### CNN + BiLSTM + LSTM

In [34]:
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPooling1D, Dropout, Bidirectional, LayerNormalization, Conv1D, Reshape
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [81]:
inputs = Input(shape=(max_len,))
em = Embedding(vocab_size, 256, input_length=max_len)(inputs)

x = Conv1D(256, 5, activation='relu')(em)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = LSTM(128)(x)
# x = GlobalMaxPooling1D()(x)
x = Dropout(0.5)(x)
outputs = Dense(11, activation='softmax')(x)

model4 = Model(inputs = inputs, outputs = outputs)
model4.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 70)]              0         
                                                                 
 embedding_5 (Embedding)     (None, 70, 256)           9468672   
                                                                 
 conv1d (Conv1D)             (None, 66, 256)           327936    
                                                                 
 bidirectional_3 (Bidirectio  (None, 66, 256)          394240    
 nal)                                                            
                                                                 
 lstm_5 (LSTM)               (None, 128)               197120    
                                                                 
 dropout_6 (Dropout)         (None, 128)               0         
                                                           

In [82]:
model4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model4_path = 'best-cnn-bilstm-lstm.h5'
mc4 = ModelCheckpoint(model4_path, verbose=1, save_best_only=True)
es4 = EarlyStopping(patience=5)

In [83]:
hist = model4.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=128, callbacks=[mc4, es4]
)

Epoch 1/30


2022-06-15 13:20:55.525586: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Epoch 00001: val_loss improved from inf to 1.85062, saving model to best-cnn-bilstm-lstm.h5
Epoch 2/30
Epoch 00002: val_loss improved from 1.85062 to 1.78418, saving model to best-cnn-bilstm-lstm.h5
Epoch 3/30
Epoch 00003: val_loss did not improve from 1.78418
Epoch 4/30
Epoch 00004: val_loss did not improve from 1.78418
Epoch 5/30
Epoch 00005: val_loss did not improve from 1.78418
Epoch 6/30
Epoch 00006: val_loss did not improve from 1.78418
Epoch 7/30
Epoch 00007: val_loss did not improve from 1.78418


In [84]:
best_model4 = load_model(model4_path)
best_model4.evaluate(X_test, Y_test)



[1.7812714576721191, 0.6264501214027405]

In [85]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [88]:
print(sentiment_predict('종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model4,tokenizer=t, max_len=max_len))

'종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 74.05869960784912%의 확률로 분쟁유발에 대한 악플입니다.
[1.6937053e-03 1.7733898e-04 1.7560368e-06 2.1665713e-03 2.4574753e-02
 1.9106083e-02 1.6512607e-04 1.4516079e-02 1.9701159e-01 1.9014257e-09
 7.4058700e-01]
None


### LSTM

In [35]:
from tensorflow.keras.layers import Embedding, Dense, LSTM, Conv1D
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [107]:
model10 = Sequential()
model10.add(Embedding(vocab_size,256))
model10.add(LSTM(64, return_sequences=True))
model10.add(LSTM(32))
model10.add(Dense(11,activation='softmax'))

model10.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, None, 256)         9468672   
                                                                 
 lstm_16 (LSTM)              (None, None, 64)          82176     
                                                                 
 lstm_17 (LSTM)              (None, 32)                12416     
                                                                 
 dense_19 (Dense)            (None, 11)                363       
                                                                 
Total params: 9,563,627
Trainable params: 9,563,627
Non-trainable params: 0
_________________________________________________________________


In [108]:
model10.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model10_path = 'LSTM.h5'
mc10 = ModelCheckpoint(model10_path, verbose=1, save_best_only=True)
es10 = EarlyStopping(patience=5)

In [109]:
hist10 = model10.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=128, callbacks=[mc10, es10]
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 1.85775, saving model to LSTM.h5
Epoch 2/30
Epoch 00002: val_loss improved from 1.85775 to 1.70981, saving model to LSTM.h5
Epoch 3/30
Epoch 00003: val_loss improved from 1.70981 to 1.65779, saving model to LSTM.h5
Epoch 4/30
Epoch 00004: val_loss did not improve from 1.65779
Epoch 5/30
Epoch 00005: val_loss did not improve from 1.65779
Epoch 6/30
Epoch 00006: val_loss did not improve from 1.65779
Epoch 7/30
Epoch 00007: val_loss did not improve from 1.65779
Epoch 8/30
Epoch 00008: val_loss did not improve from 1.65779


In [111]:
best_model10 = load_model(model10_path)
best_model10.evaluate(X_test, Y_test)



[1.684566855430603, 0.6723265051841736]

In [112]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [113]:
print(sentiment_predict('종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model10,tokenizer=t, max_len=max_len))

'종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 71.69331312179565%의 확률로 분쟁유발에 대한 악플입니다.
[9.2642097e-04 4.3856245e-04 2.7882159e-05 4.3939073e-03 1.8608904e-02
 6.4900190e-02 5.7229603e-04 2.2416074e-02 1.7078160e-01 1.0642806e-06
 7.1693313e-01]
None


### Conv1D + LSTM

In [36]:
from tensorflow.keras.layers import MaxPooling1D

model6 = Sequential()
model6.add(Embedding(vocab_size, 512, input_length=max_len))
model6.add(Conv1D(32, 3, activation = 'relu'))
model6.add(MaxPooling1D(2))
model6.add(Conv1D(32, 3, activation = 'relu'))
model6.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model6.add(Dense(11,activation='softmax'))

model6.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 70, 512)           18937344  
                                                                 
 conv1d (Conv1D)             (None, 68, 32)            49184     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 34, 32)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 32, 32)            3104      
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 11)                363       
                                                        

2022-06-15 17:10:25.529730: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-15 17:10:25.534144: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-15 17:10:25.534272: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-15 17:10:25.534688: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [122]:
model6.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model6_path = 'Conv1D_LSTM.h5'
mc6 = ModelCheckpoint(model6_path, verbose=1, save_best_only=True)
es6 = EarlyStopping(patience=5)

In [124]:
hist6 = model6.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=128, callbacks=[mc6, es6]
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 1.89077, saving model to Conv1D_LSTM.h5
Epoch 2/30
Epoch 00002: val_loss improved from 1.89077 to 1.59015, saving model to Conv1D_LSTM.h5
Epoch 3/30
Epoch 00003: val_loss did not improve from 1.59015
Epoch 4/30
Epoch 00004: val_loss did not improve from 1.59015
Epoch 5/30
Epoch 00005: val_loss did not improve from 1.59015
Epoch 6/30
Epoch 00006: val_loss did not improve from 1.59015
Epoch 7/30
Epoch 00007: val_loss did not improve from 1.59015


In [125]:
best_model6 = load_model(model6_path)
best_model6.evaluate(X_test, Y_test)







[1.5730806589126587, 0.6769669055938721]

In [126]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [127]:
print(sentiment_predict('종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model6,tokenizer=t, max_len=max_len))

'종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 79.82271909713745%의 확률로 분쟁유발에 대한 악플입니다.
[2.6514418e-03 1.0753199e-02 5.2382902e-04 4.7517363e-03 2.5320426e-04
 8.5379466e-02 1.3521438e-03 1.4312087e-02 8.1774481e-02 2.1191274e-05
 7.9822719e-01]
None


### Conv1D + BiLSTM

In [37]:
from tensorflow.keras.layers import MaxPooling1D

model7 = Sequential()
model7.add(Embedding(vocab_size, 512, input_length=max_len))
model7.add(Conv1D(32, 3, activation = 'relu'))
model7.add(MaxPooling1D(2))
model7.add(Conv1D(32, 3, activation = 'relu'))
model7.add(Bidirectional(LSTM(32, return_sequences=True)))
model7.add(GlobalMaxPooling1D())
model7.add(Dense(11,activation='softmax'))

model7.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 70, 512)           18937344  
                                                                 
 conv1d_2 (Conv1D)           (None, 68, 32)            49184     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 34, 32)           0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 32, 32)            3104      
                                                                 
 bidirectional (Bidirectiona  (None, 32, 64)           16640     
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 64)              

In [132]:
model7.compile('adam', 'categorical_crossentropy', ['accuracy'])
model7_path = 'Conv1D_BiLSTM.h5'
mc7 = ModelCheckpoint(model7_path, verbose=1, save_best_only=True)
es7 = EarlyStopping(patience=5)

In [133]:
hist7 = model7.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=100, batch_size=64, callbacks=[mc7, es7]
)

Epoch 1/100
Epoch 00001: val_loss improved from inf to 2.02339, saving model to Conv1D_BiLSTM.h5
Epoch 2/100
Epoch 00002: val_loss improved from 2.02339 to 1.89843, saving model to Conv1D_BiLSTM.h5
Epoch 3/100
Epoch 00003: val_loss did not improve from 1.89843
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.89843
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.89843
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.89843
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.89843


In [134]:
best_model7 = load_model(model7_path)
best_model7.evaluate(X_test, Y_test)



[1.8741521835327148, 0.6297194957733154]

In [135]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [136]:
print(sentiment_predict('종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model7,tokenizer=t, max_len=max_len))





'종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 71.50145769119263%의 확률로 분쟁유발에 대한 악플입니다.
[1.1188831e-02 4.8013989e-04 4.1934536e-06 6.7986222e-04 1.1881140e-01
 4.0029157e-03 4.0260304e-04 8.7737096e-03 1.4064173e-01 7.4602056e-08
 7.1501458e-01]
None


### GRU

In [38]:
from tensorflow.keras.layers import GRU

In [160]:
inputs = Input(shape=(max_len,))

model8 = Sequential()

model8.add(Embedding(vocab_size, 512, input_length=max_len))
model8.add(GRU(128))
model8.add(Dense(11, activation='softmax'))

model8.summary()

Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_30 (Embedding)    (None, 70, 512)           18937344  
                                                                 
 gru_5 (GRU)                 (None, 128)               246528    
                                                                 
 dense_30 (Dense)            (None, 11)                1419      
                                                                 
Total params: 19,185,291
Trainable params: 19,185,291
Non-trainable params: 0
_________________________________________________________________


In [152]:
model8.compile('adam', 'categorical_crossentropy', ['accuracy'])
model8_path = 'GRU.h5'
mc8 = ModelCheckpoint(model8_path, verbose=1, save_best_only=True)
es8 = EarlyStopping(patience=5)

In [153]:
hist8 = model8.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=30, batch_size=32, callbacks=[mc8, es8]
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 1.74823, saving model to GRU.h5
Epoch 2/30
Epoch 00002: val_loss improved from 1.74823 to 1.71116, saving model to GRU.h5
Epoch 3/30
Epoch 00003: val_loss did not improve from 1.71116
Epoch 4/30
Epoch 00004: val_loss did not improve from 1.71116
Epoch 5/30
Epoch 00005: val_loss did not improve from 1.71116
Epoch 6/30
Epoch 00006: val_loss did not improve from 1.71116
Epoch 7/30
Epoch 00007: val_loss did not improve from 1.71116


In [154]:
best_model8 = load_model(model8_path)
best_model8.evaluate(X_test, Y_test)



[1.7077429294586182, 0.6493355631828308]

In [44]:
import re

def sentiment_predict(review, best_model,tokenizer=t, max_len=max_len):
    review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣]',' ',review).strip()
    morphs = mecab.morphs(review)
    morphs = [word for word in morphs if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([morphs])
    padded = pad_sequences(encoded, maxlen=max_len)
    score = best_model.predict(padded)
    class_text = smile_train.columns[1:]

    return print(f"'{review}'\n {score[0][score.argmax()]*100}%의 확률로 {class_text[score.argmax()]}에 대한 악플입니다.\n{score[0]}")

In [158]:
print(sentiment_predict('종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ', best_model8,tokenizer=t, max_len=max_len))

'종북좌파 빨갱이 새끼들이 하는 짓이 그렇지 뭐 ㅋㅋ'
 87.47551441192627%의 확률로 분쟁유발에 대한 악플입니다.
[6.29698741e-04 1.57084403e-04 6.16414468e-07 9.50239692e-03
 1.49178095e-02 4.33272729e-03 1.20026169e-04 9.65251180e-04
 9.46192592e-02 7.63067554e-10 8.74755144e-01]
None


In [38]:
inputs = Input(shape=(max_len,))

model9 = Sequential()

model9.add(Embedding(vocab_size, 512, input_length=max_len))
model9.add(Conv1D(32, 5, activation = 'relu'))
model9.add(GRU(32))
model9.add(Dense(11, activation='softmax'))

model9.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 70, 512)           18937344  
                                                                 
 conv1d (Conv1D)             (None, 66, 32)            81952     
                                                                 
 gru_2 (GRU)                 (None, 32)                6336      
                                                                 
 dense_2 (Dense)             (None, 11)                363       
                                                                 
Total params: 19,025,995
Trainable params: 19,025,995
Non-trainable params: 0
_________________________________________________________________


In [39]:
model9.compile('adam', 'categorical_crossentropy', ['accuracy'])
model9_path = 'Conv1D_GRU.h5'
mc9 = ModelCheckpoint(model9_path, verbose=1, save_best_only=True)
es9 = EarlyStopping(patience=5)

In [40]:
hist9 = model9.fit(
    X_train, Y_train, validation_split=0.2,
    epochs=100, batch_size=64, callbacks=[mc9, es9]
)

Epoch 1/100


2022-06-15 15:46:26.096657: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8101
2022-06-15 15:46:26.563361: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-06-15 15:46:27.108480: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 00001: val_loss improved from inf to 1.45646, saving model to Conv1D_GRU.h5
Epoch 2/100
Epoch 00002: val_loss did not improve from 1.45646
Epoch 3/100
Epoch 00003: val_loss did not improve from 1.45646
Epoch 4/100
Epoch 00004: val_loss did not improve from 1.45646
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.45646
Epoch 6/100
Epoch 00006: val_loss did not improve from 1.45646


In [42]:
best_model9 = load_model(model9_path)
loss9, acc9 = best_model9.evaluate(X_test, Y_test)
loss9, acc9



(1.4380638599395752, 0.688356876373291)

In [60]:
print(sentiment_predict('안녕 개새끼들아', best_model9,tokenizer=t, max_len=max_len))

'안녕 개새끼들아'
 39.093783497810364%의 확률로 악플/욕설에 대한 악플입니다.
[0.01603041 0.01596536 0.1322199  0.06223116 0.01985264 0.01034814
 0.0550108  0.0062829  0.39093783 0.2863286  0.00479228]
None
