In [1]:
import tensorflow as tf
import os

# Colab publishes the TPU worker address through the COLAB_TPU_ADDR
# environment variable; the resolver expects it as a grpc:// URL.
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)

# Attach this runtime to the TPU cluster, then initialise the TPU system.
# The last call is left as a bare expression so the cell still displays its result.
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

<tensorflow.python.tpu.topology.Topology at 0x7c1aa72db730>

In [2]:
# Build the TPU distribution strategy from the cluster resolver created above;
# the training cell creates and compiles each model under strategy.scope().
strategy = tf.distribute.TPUStrategy(resolver)

In [3]:
# Mount Google Drive at /content/drive; the data and submission paths used in
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
# import wandb
# from wandb.keras import WandbCallback
from sklearn.preprocessing import LabelEncoder
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Root folder on the mounted Drive; the dataset and submission paths in this
# notebook are built by appending to it.
base_path = "/content/drive/MyDrive"

In [None]:
# Load the original training CSV.
# NOTE(review): a later cell rebinds `train_data` to the de-duplicated augmented
# data, so this frame is not the one the models are trained on.
train_data = pd.read_csv(base_path + "/dktc/data/train.csv")

In [6]:
# Load the augmented training data and preview it. The preview shows an
# integer-coded `class` column, the `conversation` text, and two "Unnamed"
# columns (presumably saved indexes from earlier CSV exports; confirm).
augment_data = pd.read_csv(base_path + "/aug_data.csv")
augment_data.head()

Unnamed: 0.1,Unnamed: 0,class,conversation
0,0,0,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...
1,1,0,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...
2,2,3,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...
3,3,1,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...
4,4,1,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...


In [7]:
# Row count of the augmented set before duplicates are dropped.
len(augment_data)

18958

In [8]:
# Label names in id order: a name's position in this list is its integer id
# (threat, extortion, workplace harassment, other harassment).
CLASS_NAMES = ['협박 대화', '갈취 대화', '직장 내 괴롭힘 대화', '기타 괴롭힘 대화']

# Lookup table: label name -> integer id.
class_dict = {class_name: idx for idx, class_name in enumerate(CLASS_NAMES)}


def _encode_label(label):
    """Return the integer id for a label name.

    Values that are already one of the known ids are returned unchanged, so
    this cell can be re-run, or run on a frame whose `class` column is already
    integer-coded, without failing.

    Raises:
        KeyError: if `label` is neither a known label name nor a known id.
    """
    if label in class_dict.values():
        return label
    return class_dict[label]


train_data['class'] = train_data['class'].apply(_encode_label)

train_data.head()


In [9]:
# Drop rows of the augmented data that repeat the same (class, conversation) pair.
# NOTE(review): this rebinds `train_data`, replacing the frame loaded from
# train.csv; from here on `train_data` means the de-duplicated augmented set.
# The original row index is kept (it is not reset).
train_data = augment_data.drop_duplicates(subset=["class", "conversation"])

In [10]:
# Conversation texts of the training frame (a pandas Series at this point);
# the preprocessing cell below turns them into a list of cleaned strings.
corpus = train_data["conversation"]
corpus.head()

0    지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...
1    길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...
2    너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...
3    어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...
4    저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...
Name: conversation, dtype: object

In [11]:
import re

def preprocess_sentence(sentence):
    """Normalise one conversation string for tokenization.

    The text is lower-cased and trimmed, then three substitutions run in order:
      1. pad each of ``? . ! ,`` with a space on both sides;
      2. collapse runs of spaces / double quotes into a single space;
      3. replace every run of characters that are not Latin letters, Hangul
         (syllables or jamo) or ``? . ! ,`` with a single space. This also
         removes digits and newlines.
    The result is trimmed once more before being returned.

    Args:
        sentence: raw text.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r"([?.!,])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Zㄱ-ㅎ가-힣ㅏ-ㅣ?.!,]+", " "),
    )

    text = sentence.lower().strip()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [12]:
# Clean every conversation; `corpus` becomes a plain list of normalised strings.
corpus = list(map(preprocess_sentence, corpus))

In [13]:
# Hugging Face checkpoints to fine-tune, one after another.
# NOTE(review): the first entry is a BigBird checkpoint, but the training cell
# loads every entry through the BERT tokenizer/model classes; the run log warns
# "You are using a model of type big_bird to instantiate a model of type bert".
model_paths = ["monologg/kobigbird-bert-base","kykim/bert-kor-base", "klue/bert-base"]


In [14]:
# Hold out 20% of the data for validation.
# - A fixed random_state makes the split, and so every validation score,
#   repeatable across kernel restarts.
# - Stratifying on the labels keeps the class proportions approximately the
#   same in the training and validation partitions.
# NOTE(review): the rows come from augmented data, so variants of one source
# conversation may end up on both sides of the split; confirm whether a
# grouped split is needed to avoid optimistic validation scores.
labels = train_data['class'].tolist()
X_train, X_val, y_train, y_val = train_test_split(
    corpus,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [15]:
# Load the test JSON and transpose it in one step; the training cell later
# reads the `text` column of the transposed frame.
test_data = pd.read_json(base_path + "/dktc/data/test.json").transpose()

In [16]:
from transformers import BertTokenizerFast, TFBertForSequenceClassification

# Maps each checkpoint name to a list of test-set logits, one entry per epoch.
predicted_dict = {}

batch_size = 32
num_epochs = 5

# The training corpus was cleaned with preprocess_sentence, so the test
# conversations must go through the same cleaning before tokenization;
# otherwise the models are scored on text formatted differently from what
# they were trained on. This does not depend on the checkpoint, so it is
# done once, outside the model loop.
test_texts = [preprocess_sentence(text) for text in test_data["text"].tolist()]

for HUGGINGFACE_MODEL_PATH in model_paths:
    print()
    print('##########################################')
    print(HUGGINGFACE_MODEL_PATH)

    # Each checkpoint has its own tokenizer, so tokenization is redone per model.
    tokenizer = BertTokenizerFast.from_pretrained(HUGGINGFACE_MODEL_PATH)

    train_encodings = tokenizer(X_train, truncation=True, padding=True)
    val_encodings = tokenizer(X_val, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

    # Training set: (features, label) pairs.
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        y_train
    ))

    # Validation set: (features, label) pairs.
    val_dataset = tf.data.Dataset.from_tensor_slices((
        dict(val_encodings),
        y_val
    ))

    # Test set: features only, there are no labels.
    test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encodings))

    # Variables must be created under the strategy scope to be placed on the TPU.
    with strategy.scope():
        model = TFBertForSequenceClassification.from_pretrained(HUGGINGFACE_MODEL_PATH, num_labels=4, from_pt=True)

        optimizer = tf.keras.optimizers.AdamW(learning_rate=5e-5)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        model.compile(optimizer=optimizer, loss=loss, metrics = ['accuracy'])

    # Train one epoch at a time so test-set logits can be captured after every epoch.
    history = []
    for _ in range(num_epochs):
        model.fit(
            train_dataset.shuffle(1000).batch(batch_size),
            # Validation metrics do not depend on sample order, so no shuffle here.
            validation_data=val_dataset.batch(batch_size),
        )

        predicted = model.predict(test_dataset.batch(16))
        history.append(predicted.logits)
    predicted_dict[HUGGINGFACE_MODEL_PATH] = history


##########################################
monologg/kobigbird-bert-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using a model of type big_bird to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.pooler.weight', 'bert.embeddings.position_ids', 'bert.pooler.bias']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- 


##########################################
kykim/bert-kor-base


tokenizer_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/344k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



##########################################
klue/bert-base


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




In [17]:
import numpy as np

def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    Each row's maximum is subtracted before exponentiating so large scores do
    not overflow; this shift does not change the resulting probabilities.

    Args:
        x: array-like with the scores along axis 1.

    Returns:
        Array of the same shape whose rows each sum to 1.
    """
    row_peak = np.max(x, axis=1, keepdims=True)
    weights = np.exp(x - row_peak)
    row_total = np.sum(weights, axis=1, keepdims=True)
    return weights / row_total


In [19]:
# Write one submission CSV per (checkpoint, epoch) pair.
# The template file is the same for every iteration, so it is read once and
# copied, instead of being re-read from Drive each time.
submission_template = pd.read_csv(base_path + "/dktc/data/submission.csv")

for HUGGINGFACE_MODEL_PATH in model_paths:
    # Use the part after the organisation prefix, e.g. "klue/bert-base" -> "bert-base".
    model_name = HUGGINGFACE_MODEL_PATH.split("/")[1]

    # Iterate over whatever epochs were actually recorded rather than a
    # hard-coded count, so a shorter or longer run cannot raise IndexError
    # or silently skip epochs.
    for i, logits in enumerate(predicted_dict[HUGGINGFACE_MODEL_PATH]):
        predicted = softmax(logits).argmax(axis=1)
        submission = submission_template.copy()
        submission['class'] = predicted
        submission.to_csv(f"{base_path}/dktc/augment-{model_name}-{i}.csv", index=False)

In [None]:
# Ensembling had no effect.