## Load Data and Library

In [1]:
#!pip3 install --upgrade tensorflow-gpu --user
#!pip3 install git+https://github.com/huggingface/transformers
#!conda install -c huggingface transformers

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
import os
import tqdm

from konlpy.tag import Okt

import sklearn
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import log_loss, accuracy_score, f1_score
import tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import transformers
#from transformers import BertTokenizer
#from transformers.models.bert.modeling_bert import BertModel,BertForMaskedLM


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
train=pd.read_csv('../data/train.csv')
test=pd.read_csv('../data/test.csv')
sample_submission=pd.read_csv('../data/sample_submission.csv')

In [4]:
print(f'train.shape:{train.shape}')
print(f'test.shape:{test.shape}')
print(f'train label 개수: {train.label.nunique()}')

train.shape:(174304, 13)
test.shape:(43576, 12)
train label 개수: 46


## Data Preprocessing

In [5]:
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [6]:
#Include not only the title of the research but abstract contents to train a model
train = train[['과제명','요약문_연구내용','label']]
test = test[['과제명','요약문_연구내용']]
train['요약문_연구내용'].fillna('NAN', inplace=True)
test['요약문_연구내용'].fillna('NAN', inplace=True)

In [7]:
train['data'] = train['과제명']+train['요약문_연구내용']
test['data'] = test['과제명']+test['요약문_연구내용']

In [8]:
print(train.shape)
print(test.shape)

(174304, 4)
(43576, 3)


In [9]:
train.head(2)

Unnamed: 0,과제명,요약문_연구내용,label,data
0,유전정보를 활용한 새로운 해충 분류군 동정기술 개발,(가) 외래 및 돌발해충의 발생조사 및 종 동정\n\n\n ○ 대상해충 : 최...,24,유전정보를 활용한 새로운 해충 분류군 동정기술 개발(가) 외래 및 돌발해충의 발생조...
1,대장암의 TRAIL 내성 표적 인자 발굴 및 TRAIL 반응 예측 유전자 지도 구축...,1차년도\n1) Microarray를 통한 선천적 TRAIL 내성 표적 후보 유전자...,0,대장암의 TRAIL 내성 표적 인자 발굴 및 TRAIL 반응 예측 유전자 지도 구축...


In [10]:
test.head(2) #stop words is required as appear

Unnamed: 0,과제명,요약문_연구내용,data
0,R-FSSW 기술 적용 경량 차체 부품 개발 및 품질 평가를 위한 64채널 C-SC...,○ 1차년도\n\n . 개발 탐촉 시스템의 성능 평가 위한 표준 시편 제작 시...,R-FSSW 기술 적용 경량 차체 부품 개발 및 품질 평가를 위한 64채널 C-SC...
1,다입자계를 묘사하는 편미분방정식에 대한 연구,연구과제1. 무한입자계의 동역학 / 작용소(operator) 방정식에 대한 연구\n...,다입자계를 묘사하는 편미분방정식에 대한 연구연구과제1. 무한입자계의 동역학 / 작용...


## Data Modeling

In [12]:
#random seed 고정
import tensorflow as tf
tf.random.set_random_seed(99)
np.random.seed(99)
BATCH_SIZE = 32
NUM_EPOCHS = 3
VALID_SPLIT = 0.2
MAX_LEN=300

In [14]:
from transformers import *
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', cache_dir='bert_ckpt', do_lower_case=False)

NameError: name 'BertTokenizer' is not defined

In [None]:
def bert_tokenizer(sent, MAX_LEN):
    
    encoded_dict = toeknizer.encode_plus(
    text = sent,
    add_special_tokens = True,
    max_length = MAX_LEN,
    pad_to_max_length = True,
    return_attention_mask = True,
    truncation = True)
    
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_id = encoded_dict['toekn_type_ids']
    
    return input_id, attention_mask, toekn_type_id


def clean_text(sent):
    sent_clean=re.sub("[^가-힣ㄱ-하-ㅣ]", " ", sent)
    return sent_clean


In [None]:
input_ids = []
attention_masks = []
token_type_ids = []
train_date_labels = []

for train_sent, train_label in zip(train['data'], train['label']):
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(clean_text(train_sent), MAX_LEN=MAX_LEN)
        
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        train_data_labels.append(train_label)
        
    except Exception as e:
        print(e)
        print(train_sent)
        pass

train_input_ids = np.array(input_ids, dtype=int)
train_attention_masks = np.array(attention_masks, dtype=int)
train_token_type_ids = np.array(token_type_ids, dtype=int)

train_inputs=(train_inputs_ids, train_attention_masks, train_token_type_ids)
train_labels = np.asarray(train_data_labels, dtype=np.int32)

In [None]:
print(train_input_ids[1])
print(train_attention_masks[1])
print(train_token_type_ids[1])
print(tokenizer.decode(train_input_ids[1]))

In [None]:
class TFBertClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super(TFBertClassifier, self).__init__()
        
        self.bert = TFBertModel.from_pretrained(model_name, cache=dir=dir_path)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class, kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),
                                               name="classifier")
        
    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        
        return logits

In [None]:
cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased', dir_path='bert_ckpt', num_class=46)

In [None]:
#Preparation for training
optimizer = tf.keras.optimizers.Adam(3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
cls_model.compile(optimzer=optimizer, loss=loss, metrics=[metric])

model_name = "tf2_bert_classifier"

#Prevent overfitting with earlystop feature
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=5)

checkpoint_path = os.path.join(model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

#Create path if exists
if os.path.exists(checkpoint_dir):
    print("{}--Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{}--Folder create complete \n".format(checkpoint_dir))

cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

#Starting training and evaluation
history=cls_model.fit(train_inputs, train_labels, epochs=30, batch_size=32,
                     validation_split = VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])

In [None]:
input_ids=[]
attention_masks=[]
token_type_ids=[]
train_data_labels=[]

for test_sent in test['data']:
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(clean_text(test_sent), MAX_LEN=40)
        
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        
    except Exception as e:
        print(e)
        print(test_sent)
        pass

test_input_ids = np.array(input_ids, dtype=int)
test_attention_masks = np.array(attention_masks, dtype=int)
test_token_type_ids = np.array(token_type_ids, dtype=int)
test_inputs = (test_input_ids, test_attention_masks, test_token_type_ids)

In [None]:
results = cls_model.predict(test_inputs)
resuts = tf.argmax(results, axis=1)

In [None]:
sample_submission['label']=results
sample_submission

In [None]:
sample_submission.to_csv('bert_baseline.csv', index=False)