In [1]:
!pip install soyspacing 


[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip




In [2]:
import re
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tqdm import tqdm
from konlpy.tag import Mecab
from soyspacing.countbase import CountSpace
from tensorflow.python.client import device_lib
from sklearn.model_selection import train_test_split

from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [3]:
# Restrict this process to the CUDA device with index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
# Show the devices TensorFlow can see (sanity check that the GPU is visible).
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 7988423307366511196
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 22718447616
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 17481597131432050140
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:02:00.0, compute capability: 8.6"
 xla_global_id: 416903419]

### **데이터 읽기**

In [5]:
# Load the train/test CSVs and drop their first column
# (presumably a saved row index — verify against the raw files).
train, test = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in ('./train.csv', './test.csv')
)
# Submission template; its 'category' column is filled in at the end of the notebook.
submission = pd.read_csv('./sample_submission.csv')

In [6]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,category,data
0,2,신혼부부위한 주택정책 보다 보육시설 늘려주세요.. 국민세금으로 일부를 위한 정책펴지...
1,0,학교이름에 '남자'도 붙여주세요. 울산여자중학교에 재학중인 학생입니다 최근 양성평등...
2,1,"빙상연맹, 대한축구협회등 각종 체육협회의 비리를 철저하게 밝혀주세요.. 최근 동계올..."
3,1,"티비 12세,15세 관람가도 연령확인 의무화 하자.. 제기 에전에 티비를 보다가 잠..."
4,1,무더운 여름철엔 남성들도 시원한 자율복장을 해야. 무더운 여름철에는 남성들도 노넥타...


### **전처리**

In [7]:
# Drop every training row that has a missing value in any column.
train = train.dropna(how='any')

In [8]:
# Korean particles and other filler tokens that are removed after tokenization.
stopwords = [
    '의', '가', '이', '은', '들',
    '는', '좀', '잘', '걍', '과',
    '도', '를', '으로', '자', '에',
    '에서', '만', '뿐', '조차', '마저',
    '까지', '와', '한', '하다', '을',
]

In [9]:
# Keep only Hangul (jamo and syllables) and spaces; every other character is deleted.
for frame in (train, test):
    frame['data'] = frame['data'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', regex=True)

In [10]:
# Remove stopwords token by token. Iterating the column yields whole documents,
# and a whole document never equals a single stopword, so the membership test
# has to be applied to the whitespace-separated tokens of each document.
# Splitting and re-joining also trims surrounding whitespace.
cleaned_train_text = [
    ' '.join(token for token in document.split() if token not in stopwords)
    for document in train['data']
]
cleaned_test_text = [
    ' '.join(token for token in document.split() if token not in stopwords)
    for document in test['data']
]

### **토큰화**

In [11]:
# Mecab dictionary location: read from the MECAB_DICPATH environment variable
# when it is set; otherwise fall back to the local Windows install path.
mecab = Mecab(dicpath=os.environ.get("MECAB_DICPATH", r"C:\mecab\mecab-ko-dic"))

In [12]:
# Extract the nouns of every training document with Mecab and drop stopwords.
# tqdm wraps the column directly: zipping the data with tqdm(range(...))
# exhausts the data first, so the progress bar never receives its final tick.
X_train = [
    [noun for noun in mecab.nouns(sentence) if noun not in stopwords]
    for sentence in tqdm(train['data'])
]

100%|█████████████████████████████████████████████████████████████████████████▉| 39991/39992 [00:22<00:00, 1744.43it/s]


In [13]:
# Extract the nouns of every test document with Mecab and drop stopwords.
# tqdm wraps the column directly: zipping the data with tqdm(range(...))
# exhausts the data first, so the progress bar never receives its final tick.
X_test = [
    [noun for noun in mecab.nouns(sentence) if noun not in stopwords]
    for sentence in tqdm(test['data'])
]

100%|███████████████████████████████████████████████████████████████████████████▉| 4999/5000 [00:02<00:00, 1746.40it/s]


In [14]:
# Fit a tokenizer on the training noun lists without a vocabulary cap.
# NOTE(review): a later cell rebinds `tokenizer` to a size-capped Tokenizer and
# fits it again, so this fit looks redundant — confirm before removing.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [16]:
# Vocabulary cap handed to the Tokenizer: only the most frequent words are kept
# when texts are converted to integer sequences.
# NOTE(review): 39992 is the number of training documents (see the progress bar
# of the tokenization cell), not the number of distinct words.
VOCAB_SIZE = 30000
MAX_LEN = 350  # length every sequence is padded / truncated to

tokenizer = Tokenizer(VOCAB_SIZE)
tokenizer.fit_on_texts(X_train) # builds the tokenizer's word index from the training noun lists

In [18]:
# Replace every noun list by the list of its integer word indices.
X_train, X_test = (
    tokenizer.texts_to_sequences(noun_lists)
    for noun_lists in (X_train, X_test)
)

In [19]:
# Bring every index sequence to exactly MAX_LEN entries (zero-padded).
X_train, X_test = (
    pad_sequences(index_sequences, maxlen=MAX_LEN)
    for index_sequences in (X_train, X_test)
)

In [20]:
# Inspect the padded training matrix.
X_train

array([[   0,    0,    0, ...,   72, 1776, 7064],
       [   0,    0,    0, ...,    7,   18,  131],
       [   0,    0,    0, ...,  187, 1013,    2],
       ...,
       [   0,    0,    0, ...,    4,  449,   33],
       [   0,    0,    0, ...,    2,   50,  307],
       [   0,    0,    0, ...,  189,  933,  327]])

In [21]:
# One-hot encode the integer class labels for the softmax output layer.
y_train = to_categorical(train['category'])

In [22]:
# Inspect the one-hot encoded labels.
y_train

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

### **모델링(Transformer)**

데이콘 

In [30]:
# Directory the ModelCheckpoint callback writes the model weights into.
path = './model'

In [31]:
class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product self-attention.

    Query, key and value are all projected from the same input tensor, split
    into ``num_heads`` heads of size ``embedding_dim // num_heads``, attended
    independently, then concatenated and passed through a final dense layer.

    NOTE: the notebook also wildcard-imports ``tensorflow.keras.layers``; if
    that module ships its own ``MultiHeadAttention``, this class shadows it.

    Args:
        embedding_dim: Size of the last axis of the input and output (d_model).
            Must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
            Accepting them lets the layer be rebuilt from ``get_config()``.
    """

    def __init__(self, embedding_dim, num_heads=8, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim # d_model
        self.num_heads = num_heads

        assert embedding_dim % self.num_heads == 0, \
            "embedding_dim must be divisible by num_heads"

        self.projection_dim = embedding_dim // num_heads
        self.query_dense = Dense(embedding_dim)
        self.key_dense = Dense(embedding_dim)
        self.value_dense = Dense(embedding_dim)
        self.dense = Dense(embedding_dim)

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer.

        Only plain Python values are stored. The sub-layers are not included:
        they are not serializable and are rebuilt by ``__init__`` anyway.
        """
        config = super().get_config().copy()
        config.update({
            'embedding_dim' : self.embedding_dim,
            'num_heads' : self.num_heads,
        })
        return config

    def scaled_dot_product_attention(self, query, key, value):
        """Compute ``softmax(Q K^T / sqrt(depth)) V``.

        Returns:
            A tuple ``(output, attention_weights)``.
        """
        matmul_qk = tf.matmul(query, key, transpose_b=True)
        # Scale by the size of the key's last axis before the softmax.
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = matmul_qk / tf.math.sqrt(depth)
        attention_weights = tf.nn.softmax(logits, axis=-1)
        output = tf.matmul(attention_weights, value)
        return output, attention_weights

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch_size, num_heads, seq_len, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs`` of shape (batch_size, seq_len, embedding_dim)."""
        batch_size = tf.shape(inputs)[0]

        # (batch_size, seq_len, embedding_dim)
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)

        # (batch_size, num_heads, seq_len, projection_dim)
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        scaled_attention, _ = self.scaled_dot_product_attention(query, key, value)
        # (batch_size, seq_len, num_heads, projection_dim)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # Merge the heads back: (batch_size, seq_len, embedding_dim)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.embedding_dim))
        outputs = self.dense(concat_attention)
        return outputs

In [32]:
class TransformerBlock(Layer):
    """Transformer encoder block: self-attention and a feed-forward network.

    Each of the two sub-blocks is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embedding_dim: Size of the last axis of the input and output.
        num_heads: Number of attention heads.
        dff: Units of the hidden layer of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward network.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
            Accepting them lets the layer be rebuilt from ``get_config()``.
    """

    def __init__(self, embedding_dim, num_heads, dff, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate

        self.att = MultiHeadAttention(embedding_dim, num_heads)
        self.ffn = Sequential(
            [Dense(dff, activation="relu"),
             Dense(embedding_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer.

        Only plain Python values are stored. The sub-layers are not included:
        they are not serializable and are rebuilt by ``__init__`` anyway.
        """
        config = super().get_config().copy()
        config.update({
            'embedding_dim' : self.embedding_dim,
            'num_heads' : self.num_heads,
            'dff' : self.dff,
            'rate' : self.rate,
        })
        return config

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embedding_dim``.
            training: Forwarded to both dropout layers. Defaults to ``None``
                so the layer can also be called without the argument.
        """
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm

In [33]:
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        max_len: Number of positions the position embedding table covers.
        vocab_size: Number of rows of the token embedding table.
        embedding_dim: Size of every embedding vector.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
            Accepting them lets the layer be rebuilt from ``get_config()``.
    """

    def __init__(self, max_len, vocab_size, embedding_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.token_emb = Embedding(vocab_size, embedding_dim)
        self.pos_emb = Embedding(max_len, embedding_dim)

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer.

        Only plain Python values are stored. The embedding sub-layers are not
        included: they are not serializable and are rebuilt by ``__init__``.
        """
        config = super().get_config().copy()
        config.update({
            'max_len' : self.max_len,
            'vocab_size' : self.vocab_size,
            'embedding_dim' : self.embedding_dim,
        })
        return config

    def call(self, x):
        """Embed the token ids in ``x`` and add the embedding of each position."""
        # Positions 0 .. seq_len-1, taken from the size of the last axis of x.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [34]:
# Model hyper-parameters.
embedding_dim = 32  # size of every token / position embedding vector
num_heads = 4       # attention heads inside the transformer block
dff = 32            # hidden units of the block's feed-forward network

# Create the input and the two custom layers, then wire them together.
inputs = Input(shape=(MAX_LEN,))
embedding_layer = TokenAndPositionEmbedding(MAX_LEN, VOCAB_SIZE, embedding_dim)
transformer_block = TransformerBlock(embedding_dim, num_heads, dff)

# embed -> transformer block -> average over the sequence -> classifier head
x = transformer_block(embedding_layer(inputs))
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(8, activation="relu")(x)
x = Dropout(0.1)(x)
outputs = Dense(3, activation="softmax")(x)  # one probability per class

In [35]:
# Wrap the functional graph built above into a trainable Keras model.
model = Model(inputs=inputs, outputs=outputs)

In [36]:
ckpt_1 = 'tf_chkpoint.ckpt'

# One-hot targets with a softmax output -> categorical cross-entropy.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Save weights only, and only when validation accuracy reaches a new maximum.
mc = ModelCheckpoint(
    filepath=os.path.join(path, ckpt_1),
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
# Stop once validation loss has failed to improve by at least 1e-5 for 20 epochs.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.00001, patience=20)

In [37]:
# Train on the GPU with 20% of the training data held out for validation.
# `epochs` was left without a value (a syntax error); the recorded training log
# reads "Epoch 1/100", so the run used 100 epochs.
with tf.device('/device:GPU:0'):
    history = model.fit(X_train, y_train, batch_size=128, epochs=100, validation_split=0.2, callbacks = [mc, early_stopping])

Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.87148, saving model to ./model\tf_chkpoint.ckpt
Epoch 2/100

Epoch 00002: val_accuracy did not improve from 0.87148
Epoch 3/100

Epoch 00003: val_accuracy improved from 0.87148 to 0.87923, saving model to ./model\tf_chkpoint.ckpt
Epoch 4/100

Epoch 00004: val_accuracy did not improve from 0.87923
Epoch 5/100

Epoch 00005: val_accuracy did not improve from 0.87923
Epoch 6/100

Epoch 00006: val_accuracy did not improve from 0.87923
Epoch 7/100

Epoch 00007: val_accuracy did not improve from 0.87923
Epoch 8/100

Epoch 00008: val_accuracy did not improve from 0.87923
Epoch 9/100

Epoch 00009: val_accuracy did not improve from 0.87923
Epoch 10/100

Epoch 00010: val_accuracy did not improve from 0.87923
Epoch 11/100

Epoch 00011: val_accuracy did not improve from 0.87923
Epoch 12/100

Epoch 00012: val_accuracy did not improve from 0.87923
Epoch 13/100

Epoch 00013: val_accuracy did not improve from 0.87923
Epoch 14/100

Epoch 000

In [38]:
# Restore the weights saved by the ModelCheckpoint callback (best validation
# accuracy); otherwise the prediction would use the weights of the last epoch.
model.load_weights(os.path.join(path, ckpt_1))
y_pred = model.predict(X_test)

In [39]:
# The predicted class of each test row is the index of its largest probability.
submission['category'] = y_pred.argmax(axis=-1)
submission.to_csv('./submission.csv', index=False, encoding='utf-8')