In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Data Load

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from konlpy.tag import Hannanum
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras.models import Model

In [None]:
import os
# Folder on the mounted Drive that holds the competition CSV files.
data_root = "/content/drive/MyDrive/공모전/BOAZ_KED 공모전/KEDxBOAZ/"
traindata_route = os.path.join(data_root, 'train_dfC.csv')
# The first CSV column is used as the DataFrame index.
traindata = pd.read_csv(traindata_route, index_col = 0)

### Data tokenizing

In [None]:
def _extract_labels(index_values, code, label_size, use_prefix):
    """Build the label string for every industry code in ``index_values``.

    Each index value is converted with ``str`` first, so numeric and string
    indexes are handled the same way.
    """
    codes = [str(value) for value in index_values]

    # '대분류' has no partial-slice variant, so it always uses the prefix form
    # (previously all=False with '대분류' left the labels undefined).
    if use_prefix or code == '대분류':
        return np.array([c[:label_size] for c in codes])

    partial_slicers = {
        '중분류': lambda c: c[1:3],
        '소분류': lambda c: c[3],
        '세분류': lambda c: c[4],
        '대소분류': lambda c: c[0] + c[3] if len(c) > 4 else c[0],
    }
    slicer = partial_slicers[code]
    return np.array([slicer(c) for c in codes])


def data_tokenizing(data, length, code, all = True):
    """Tokenize the text column and one-hot encode the labels taken from the index.

    Args:
        data: DataFrame with a 'BZ_PPOS_ITM_CTT' text column; labels are derived
            from ``data.index.values``.
        length: int, every token sequence is padded/truncated to this length
            (padding is added at the front).
        code: one of '대분류', '중분류', '소분류', '세분류', '대소분류'; selects which
            part of the index string becomes the label.
        all: if truthy, the label is the first ``CODE[code]`` characters of the
            index string; otherwise only the slice belonging to that level is
            used (e.g. characters 1-2 for '중분류').

    Returns:
        train_inputs: padded integer token sequences.
        train_labels: one-hot encoded labels (dense array).
        data_configs: dict with 'vocab' (word -> index), 'vocab_size'
            (len(vocab) + 1) and 'raw_labels' (the encoder's categories).
        enc: the fitted OneHotEncoder.

    Raises:
        KeyError: if ``code`` is not one of the supported levels.
    """
    # Number of leading characters of the index string used per level when all is truthy.
    CODE = {'대분류' : 1, '중분류' : 3, '소분류' : 4, '세분류' : 5, '대소분류':4}

    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from sklearn.preprocessing import OneHotEncoder

    texts = data['BZ_PPOS_ITM_CTT'].astype(str)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    train_seq = tokenizer.texts_to_sequences(texts)
    word_vocab = tokenizer.word_index

    train_inputs = pad_sequences(train_seq, maxlen = length, padding = 'pre')

    train_labels = _extract_labels(data.index.values, code, CODE[code], bool(all))

    # Dictionary describing the data: vocabulary and its size (+1 for the padding id 0).
    data_configs = {'vocab': word_vocab, 'vocab_size': len(word_vocab) + 1}

    # `sparse` was renamed to `sparse_output` in newer scikit-learn releases;
    # fall back to the old keyword when the new one is not accepted.
    try:
        enc = OneHotEncoder(sparse_output = False)
    except TypeError:
        enc = OneHotEncoder(sparse = False)
    train_labels = enc.fit_transform(train_labels.reshape(-1, 1))
    data_configs['raw_labels'] = enc.categories_

    return train_inputs, train_labels, data_configs, enc

In [None]:
# Set to True when predicting '대분류' (labels are then the leading prefix of the code).
# Kept under its own name so the builtin `all` is not shadowed for the rest of the notebook.
use_full_prefix = False
category = '중분류'
train_inputs, train_labels, data_configs, label_encoder = data_tokenizing(traindata, 150, category, all = use_full_prefix)

### MODEL

#### Self Attention
1. positional encoding은 embedding vector에 단어의 문장에서의 위치에 대한 정보를 제공하기 위해서 주어진다.
2. positional encoding을 추가한 이후에는 의미의 similarity와 문장에서의 위치가 반영이 가능하게 될 것이다.

#### Embedding
1. 먼저 positional encoding vector을 만들어야 한다.
  - 문장 내에서 단어의 상대적인 위치 정보를 제공해 줄 것이다.

#### POSITIONAL ENCODING

In [None]:
def get_angles(pos, i, d_model):
  """Return the positional-encoding angles pos / 10000^(2*(i//2) / d_model).

  Args:
    pos: array of positions (broadcast against ``i``).
    i: integer array of embedding-dimension indices.
    d_model: embedding dimension.

  Returns:
    Array of angles with the broadcast shape of ``pos`` and ``i``.
  """
  # Dimensions 2k and 2k+1 share one frequency, hence i // 2. True division is
  # required here: floor division would collapse the exponent to 0 or 1.
  angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
  return pos * angle_rates

In [None]:
def positional_encoding(position, d_model):
  """Build the positional-encoding table as a float32 tensor of shape (1, position, d_model).

  Even columns hold the sine of the angle, odd columns the cosine.
  """
  positions = np.arange(position)[:, np.newaxis]
  dims = np.arange(d_model)[np.newaxis, :]
  table = get_angles(positions, dims, d_model)

  even_cols = slice(0, None, 2)
  odd_cols = slice(1, None, 2)
  table[:, even_cols] = np.sin(table[:, even_cols])
  table[:, odd_cols] = np.cos(table[:, odd_cols])

  # Leading axis of size 1 so the table broadcasts over the batch.
  return tf.cast(table[np.newaxis, ...], dtype = tf.float32)

#### MASK PADDING
- 문자열의 모든 batch들에 대해서 padding된 token을 masking한다.
- model이 padding된 부분을 **input으로 고려할 수 없도록**한다.
- mask는 pad value(0)가 존재하는 위치는 1, 그렇지 않은 위치는 0으로 설정이 되도록 한다. (1인 위치는 attention logit에 -1e9가 더해져 무시된다.)

In [None]:
def create_padding_masks(seq):
  """Return a float mask that is 1.0 where ``seq`` equals 0 (padding) and 0.0 elsewhere.

  Two singleton axes are inserted so the result has shape
  (batch_size, 1, 1, seq_length) and can be added to attention logits.
  """
  is_padding = tf.math.equal(seq, 0)
  mask = tf.cast(is_padding, tf.float32)
  return mask[:, tf.newaxis, tf.newaxis, :]

#### SCALED DOT PRODUCT ATTENTION
-  $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V$ (여기서 $d_k$는 key 벡터의 차원)

In [None]:
def scaled_dot_product_attention(q, v, k, mask):
  """Compute softmax(q @ k^T / sqrt(depth_k)) @ v.

  NOTE: the positional order is (q, v, k, mask) -- value comes before key.
  Pass the arguments by keyword to avoid swapping key and value.

  @param q : query (.., seq_length_q, depth)
  @param v : value (.., seq_length_v, depth_v)
  @param k : key (.., seq_length_k, depth)
  @param mask : broadcastable to (.., seq_length_q, seq_length_k), or None;
                positions where mask is 1 receive a large negative logit.
  @return : (output, attention_weights)
  """
  # Scale by the square root of the key depth (last axis of k).
  key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
  logits = tf.matmul(q, k, transpose_b = True) / tf.math.sqrt(key_depth)

  if mask is not None:
    logits = logits + (mask * -1e9)

  weights = tf.nn.softmax(logits, axis = -1)

  return tf.matmul(weights, v), weights



#### MULTI HEAD ATTENTION
- Linear Layers and Splits into Heads
- Scaled Dot-Product Attention
- Concatenation of Heads
- Final Linear Layer

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention: project q/k/v, split into heads, attend, concatenate, project."""

  def __init__(self, d_model, num_heads):
    '''
    d_model : model dimension; size of the query/key/value projections
    num_heads : number of attention heads; must divide d_model

    Raises ValueError when d_model is not divisible by num_heads.
    '''
    super(MultiHeadAttention, self).__init__()
    # The model dimension is split evenly across the heads.
    if d_model % num_heads != 0:
      raise ValueError(
          'd_model (%d) must be divisible by num_heads (%d)' % (d_model, num_heads))

    self.num_heads = num_heads
    self.d_model = d_model

    # Per-head feature size. Using d_model here would make split_heads fold the
    # sequence axis into the head axis instead of splitting the feature axis.
    self.depth = d_model // num_heads

    self.fcq = tf.keras.layers.Dense(d_model) # query projection
    self.fcv = tf.keras.layers.Dense(d_model) # value projection
    self.fck = tf.keras.layers.Dense(d_model) # key projection

    self.dense = tf.keras.layers.Dense(d_model) # final output projection

  def call(self, v, k, q, mask = None):
    """Return (output, attention_weights).

    output has shape (batch_size, seq_len_q, d_model); attention_weights has
    shape (batch_size, num_heads, seq_len_q, seq_len_k), so its memory grows
    linearly with num_heads.
    """
    batch_size = tf.shape(q)[0]

    q = self.fcq(q)
    k = self.fck(k)
    v = self.fcv(v)

    q = self.split_heads(q, batch_size)
    k = self.split_heads(k, batch_size)
    v = self.split_heads(v, batch_size)

    # Keyword arguments: the helper's positional order is (q, v, k, mask).
    scaled_attention, attention_w = scaled_dot_product_attention(
        q = q, k = k, v = v, mask = mask)

    # (batch, heads, seq, depth) -> (batch, seq, heads, depth)
    scaled_attention = tf.transpose(scaled_attention, perm = [0,2,1,3])

    # Merge the heads back into a single d_model-sized feature axis.
    concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

    output = self.dense(concat_attention)

    return output, attention_w

  def split_heads(self, x, batch_size):
    """
    Split the last axis of x into (num_heads, depth) and move the head axis forward:
    (batch_size, seq_len, d_model) -> (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    x = tf.transpose(x, perm = [0,2,1,3])

    return x

#### ENCODER LAYER

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and a feed-forward network, each followed by
  dropout, a residual connection and layer normalization."""

  def __init__(self, d_model, num_heads, dff, rate = 0.1):
    # rate: dropout rate applied to both sub-layer outputs (0.1 by default)
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)

    self.do1 = tf.keras.layers.Dropout(rate)
    self.do2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training = None, mask = None):
    """Apply the block to x of shape (batch_size, input_seq_length, d_model).

    training: forwarded to the dropout layers.
    mask: forwarded to the attention layer (None disables masking).
    """
    # Forward the caller's mask instead of always discarding it.
    att_output, _ = self.mha(x, x, x, mask = mask) # (batch_size, input_seq_length, d_model)
    att_output = self.do1(att_output, training = training)
    out1 = self.layernorm1(x + att_output)

    ffn_output = self.ffn(out1) # (batch_size, input_seq_length, d_model)
    ffn_output = self.do2(ffn_output, training = training)
    out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_length, d_model)

    return out2

#### POINT WISE FEED FORWARD NETWORK
1. two fully-connected layers
2. ReLU activation function을 두개의 완전 연결 층의 사이에 둔 network이다.

In [None]:
def point_wise_feed_forward(d_model, dff):
  """Return a two-layer feed-forward network: Dense(dff, relu) followed by Dense(d_model)."""
  hidden = tf.keras.layers.Dense(dff, activation = 'relu') # (batch_size, seq_len, dff)
  projection = tf.keras.layers.Dense(d_model) # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([hidden, projection])

#### ENCODER
1. Input Embedding
2. Positional Encoding
3. N encoder layers
- 입력된 텍스트 데이터는 Embedding 층을 거쳐서 positional encoding과 결합이 된다.
- 결합된 데이터는 encoder layer의 입력값으로 반환이 된다.
- Encoder는 최종적으로 **word embedding과 position embedding 정보를 받아 input text에 대한 context information을 출력**한다.
- 출력 데이터의 shape는 **(batch_size, input_seq_length, d_model)**이다

### Encoder class

In [None]:
class Encoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by a stack of EncoderLayer blocks."""

  def __init__(self, num_layers, d_model, num_heads, dff, input_voca_size,
               max_pos_encoding, rate = 0.1):
    """
    num_layers : number of EncoderLayer blocks
    d_model : embedding / model dimension
    num_heads : attention heads per block
    dff : hidden units of the feed-forward network
    input_voca_size : vocabulary size for the embedding
    max_pos_encoding : number of positions in the positional-encoding table
    rate : dropout rate
    """
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.dff = dff
    # No fixed input_length: the sequence length is taken from the input at call time.
    self.embedding = tf.keras.layers.Embedding(input_voca_size, d_model) # (batch_size, seq_len, d_model)
    self.pos_encoding = positional_encoding(max_pos_encoding, d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(self.num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training = None, mask = None):
    """Encode integer token ids x of shape (batch_size, seq_len).

    training: forwarded to dropout and to every encoder block.
    mask: optional attention mask forwarded to every encoder block
          (None, the default, applies no masking).
    Returns a tensor of shape (batch_size, seq_len, d_model).
    """
    x = self.embedding(x)
    # Scale embeddings by sqrt(d_model) before adding the positional encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # The table is sliced to the input's sequence length; its feature size is d_model.
    x += self.pos_encoding[:, :tf.shape(x)[1], :]

    x = self.dropout(x, training = training)

    for layer in self.enc_layers:
      x = layer(x, training = training, mask = mask)

    return x

### CLASSIFIER

In [None]:
class Classifier(tf.keras.Model):
  """Text classifier: Transformer encoder -> GRU -> Dense/BatchNorm/Dropout -> softmax."""

  def __init__(self, enc_configs, data_configs, **kwargs):
    """
    @param enc_configs : dict with 'num_layers', 'd_model', 'num_heads', 'dff',
                         'max_pos_encoding', 'add_input' and 'type'
    @param data_configs : dict with 'raw_labels' and 'vocab_size'
    @param kwargs : forwarded to tf.keras.Model (e.g. name)
    """
    super(Classifier, self).__init__(**kwargs)

    self.num_layers = enc_configs['num_layers']

    self.added_input = enc_configs['add_input'] # optional extra (pre-label) input
    self.type = enc_configs['type'] # how the extra input is merged; None disables merging
    self.raw_labels = data_configs['raw_labels'][0]
    # The positional-encoding length comes from its own config entry, not from num_heads.
    self.encoder =  Encoder(num_layers = enc_configs['num_layers'],
                            d_model = enc_configs['d_model'], num_heads = enc_configs['num_heads'],
                            dff = enc_configs['dff'], max_pos_encoding = enc_configs['max_pos_encoding'],
                            input_voca_size = data_configs['vocab_size'])

    # The encoder output (batch_size, input_seq, d_model) is fed to the GRU directly.
    self.gru = tf.keras.layers.GRU(512, activation = 'relu')

    self.main_layer = [
      tf.keras.layers.Dense(units = 512, activation = 'relu'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.Dropout(0.3)
    ]

    self.fin = tf.keras.layers.Dense(len(self.raw_labels), activation = 'softmax')

  def return_logits(self, x):
    """Return the predicted raw label for every row of token ids x.

    Despite the name, this returns labels (looked up in self.raw_labels by
    argmax of the softmax output), not logits. It converts the result to a
    NumPy array, so it is meant for eager execution.
    """
    probabilities = self(x, training = False)
    answer_idx = tf.argmax(probabilities, axis = -1)
    return [self.raw_labels[idx] for idx in np.asarray(answer_idx)]

  def merge_inputs(self, type, x):
    """Return x unchanged when no merge type is configured.

    Merging the additional input is not implemented; failing here is clearer
    than returning None and crashing later in the GRU.
    """
    if self.type is None:
      return x

    raise NotImplementedError(
        "merging the additional input is not implemented; set enc_configs['type'] to None")

  def call(self, x):
    x = self.encoder(x) # (batch_size, input_seq_len, d_model)
    x = self.merge_inputs(self.type, x)

    x = self.gru(x)
    # The same Dense/BatchNorm/Dropout layer objects are applied num_layers
    # times, so their weights are shared across repetitions.
    for _ in range(self.num_layers):
      for layers in self.main_layer:
        x = layers(x)

    result = self.fin(x)

    return result

### HYPERPARAMETER

In [None]:
NUM_LAYERS = 4
D_MODEL = 150 # same as the embedding vector dimension
DFF = 512
NUM_HEADS = 150
INPUT_VOCA_SIZE = data_configs['vocab_size']
BATCH_SIZE = 256
EPOCH = 20

# Encoder/classifier settings consumed by Classifier.
# NOTE(review): confirm how Classifier uses 'num_heads' and 'max_pos_encoding'
# before changing either value.
enc_configs = {
    'num_layers': NUM_LAYERS,
    'd_model': D_MODEL,
    'num_heads': NUM_HEADS,
    'dff': DFF,
    'max_pos_encoding': 150, # max_pos_encoding
    'add_input': None,
    'type': None,
}

In [None]:
from sklearn.model_selection import train_test_split
# Fixed random_state so the 70/30 split is identical on every run.
x_train, x_test, y_train, y_test = train_test_split(train_inputs, train_labels, test_size = 0.3, random_state = 42)

1. Optimizer

In [None]:
# Adam optimizer with learning rate 3e-4, beta_2 = 0.98 and epsilon = 1e-9.
optimizer = tf.keras.optimizers.Adam(learning_rate = 3e-4, beta_1 = 0.9, beta_2 = 0.98, epsilon = 10**(-9))

2. Loss

In [None]:
# Categorical cross-entropy; the labels passed to fit are one-hot encoded.
loss = tf.keras.losses.CategoricalCrossentropy()

3. Training and Checkpointing


In [None]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop when val_loss has not improved for 5 epochs and restore the best weights seen.
earlystopping = EarlyStopping(
    monitor='val_loss', min_delta=0, patience=5, verbose=0,
    mode='auto', baseline=None, restore_best_weights=True
)

## Model train

In [None]:
with tf.device('/device:GPU:0'):
  # NOTE(review): this rebinds `Model`, shadowing the class imported from
  # tensorflow.keras.models; the name is kept because other cells use it.
  Model = Classifier(enc_configs = enc_configs, data_configs = data_configs)
  Model.compile(optimizer = optimizer, loss = loss, metrics = ['accuracy'])
  # `callbacks` is documented as a list of callback instances.
  history = Model.fit(x_train, y_train,  validation_split = 0.3, epochs = EPOCH,  batch_size = BATCH_SIZE, callbacks = [earlystopping])

#### Train 결과 시각화 & evaluate

In [None]:
import matplotlib.pyplot as plt
def merge_plot(history):
  """Plot training/validation loss and accuracy on one figure with twin y-axes.

  history: object whose ``history`` dict has 'loss', 'val_loss', 'accuracy'
           and 'val_accuracy' entries.
  """
  fig, loss_ax = plt.subplots()
  acc_ax = loss_ax.twinx()

  loss_ax.plot(history.history['loss'], 'y', label = 'loss')
  loss_ax.plot(history.history['val_loss'], 'r', label = 'val_loss')
  loss_ax.set_xlabel('epoch')
  loss_ax.set_ylabel('loss')
  loss_ax.legend(loc = 'upper left')

  acc_ax.plot(history.history['accuracy'], 'b', label = 'accuracy')
  acc_ax.plot(history.history['val_accuracy'], 'g', label = 'val_accuracy')
  acc_ax.set_ylabel('accuracy')
  # The twin axes share the same area, so this legend uses a different corner
  # than the loss legend to avoid drawing on top of it.
  acc_ax.legend(loc = 'lower left')

In [None]:
# Plot the loss/accuracy curves recorded during training.
merge_plot(history)

In [None]:
# Evaluate on the held-out split produced by train_test_split.
Model.evaluate(x_test, y_test)

### 예측기 저장

In [None]:
import os
# Folder on the mounted Drive where models are stored.
modelpath = "/content/drive/MyDrive/공모전/BOAZ_KED 공모전/MODELS"
# The model is written to <modelpath>/Model.
Model.save(os.path.join(modelpath, 'Model'))



INFO:tensorflow:Assets written to: /content/drive/MyDrive/KEDxBOAZMY/MODELS/Model_A/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/KEDxBOAZMY/MODELS/Model_A/assets


In [None]:
# Print the per-layer parameter summary of the classifier.
Model.summary()

Model: "classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder (Encoder)            multiple                  164956148 
_________________________________________________________________
gru (GRU)                    multiple                  1019904   
_________________________________________________________________
dense_24 (Dense)             multiple                  262656    
_________________________________________________________________
batch_normalization (BatchNo multiple                  2048      
_________________________________________________________________
dropout_9 (Dropout)          multiple                  0         
_________________________________________________________________
dense_25 (Dense)             multiple                  40014     
Total params: 166,280,770
Trainable params: 166,279,746
Non-trainable params: 1,024
______________________________________

### 4. Test Data에 대한 예측 도출

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os

modelpath = "/content/drive/MyDrive/공모전/BOAZ_KED 공모전/MODELS"
def load_model(model_name):
  """Load and return the Keras model stored at <modelpath>/<model_name>."""
  target = os.path.join(modelpath, str(model_name))
  return tf.keras.models.load_model(target)

In [None]:
# Reload the model stored under the name 'Model'.
Model_T = load_model('Model')



### Predict

In [None]:
def test(model, testdata, pad_len, encoder, category, vocab = None):
    """Predict labels for ``testdata`` and return them as a one-column DataFrame.

    The text is mapped to token ids with the TRAINING vocabulary. Fitting a new
    tokenizer on the test data would assign different ids to the same words and
    feed the model inputs it was never trained on.

    Args:
        model: trained model exposing ``predict``.
        testdata: DataFrame with a 'BZ_PPOS_ITM_CTT' text column.
        pad_len: sequence length used for padding (must match training).
        encoder: fitted label encoder; its ``inverse_transform`` maps the model
            output back to label strings.
        category: kept for backward compatibility; not used for prediction.
        vocab: word -> index mapping built on the training data. Defaults to
            the notebook-level ``data_configs['vocab']``.

    Returns:
        DataFrame indexed like ``testdata`` with one column of predicted labels.
    """
    from keras.preprocessing.text import text_to_word_sequence
    from keras.preprocessing.sequence import pad_sequences

    if vocab is None:
        # Vocabulary produced when the training data was tokenized.
        vocab = data_configs['vocab']

    texts = testdata['BZ_PPOS_ITM_CTT'].astype(str)
    # Words missing from the training vocabulary are dropped.
    test_seq = [[vocab[word] for word in text_to_word_sequence(text) if word in vocab]
                for text in texts]
    test_inputs = pad_sequences(test_seq, maxlen = pad_len, padding = 'pre')

    y_pred = model.predict(test_inputs)
    y_pred = encoder.inverse_transform(y_pred)

    # Return the predictions as a DataFrame aligned with the test index.
    y_pred = pd.DataFrame(data=y_pred, index=testdata.index.values, columns=['모델이 예측한 업종코드'])

    return y_pred

In [None]:
def to_format(path, y_pred, testdata):
    """Reorder the predictions to match the submission template and write them to Excel.

    path: Excel template whose first column defines the row order
    y_pred: DataFrame of predictions
    testdata: test DataFrame (not used here)
    """
    output_dir = "/content/drive/MyDrive/공모전/BOAZ_KED 공모전/KEDxBOAZ"
    output_file = os.path.join(output_dir, '[공모전]정답제출양식_(모동숙)_210523.xlsx')

    template = pd.read_excel(path, index_col=0)

    # Align rows with the template, then expose the index as the 'KEDCD' column.
    ordered = y_pred.reindex(template.index.values).reset_index()
    ordered = ordered.rename(columns={"index": "KEDCD"})
    ordered.to_excel(output_file, index=False)

TEST 데이터 준비

In [None]:
import os
# Folder on the mounted Drive that holds the competition CSV files.
data_root = "/content/drive/MyDrive/공모전/BOAZ_KED 공모전/KEDxBOAZ"
testdata_route = os.path.join(data_root, 'test_df.csv')
# The first CSV column is used as the DataFrame index.
testdata = pd.read_csv(testdata_route, index_col = 0)

In [None]:
# label_encoder : the label encoder returned when the training data was tokenized
# Select the classification level to predict (uncomment exactly one).
# category = '대분류'
category = '중분류' 
# category = '소분류'
# category = '세분류'
# category = '대소분류'

# 150 is the padding length passed to data_tokenizing for the training data.
y_pred = test(Model_T, testdata, 150, label_encoder, category)

In [None]:
# Path of the Excel submission template passed to to_format.
submission_path = os.path.join(data_root, '[공모전]정답제출양식_(조)_제출일자.xlsx')
to_format(submission_path, y_pred, testdata)

----------------------