# install

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# !pip install transformers
# !pip install datasets

In [1]:
import collections
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import default_data_collator
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings(action='ignore')

# data

In [2]:
model = AutoModelForMaskedLM.from_pretrained('skt/kobert-base-v1')
tokenizer = AutoTokenizer.from_pretrained('skt/kobert-base-v1')

Some weights of BertForMaskedLM were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
num_parameters = model.num_parameters()
print(f"skt/kobert-base-v1 parameters: {num_parameters}")

skt/kobert-base-v1 parameters: 92196418


In [3]:
dataset = load_dataset("csv", data_files="./data/report_dataset.csv")

Using custom data configuration default-d2549ac2fb5ac5b1
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-d2549ac2fb5ac5b1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
def tokenize_function(examples):
    result = tokenizer(examples['article'])
    if tokenizer.is_fast:
        result['word_ids'] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

In [25]:
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['Unnamed: 0', 'company', 'title', 'article', 'opinion', 'firm', 'date'])

  0%|          | 0/51 [00:00<?, ?ba/s]

In [27]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 50083
    })
})

In [29]:
tokenizer.model_max_length

1000000000000000019884624838656

In [7]:
def group_texts(examples):
    # 모든 텍스트들을 결합
    concatenated_examples = {k:sum(examples[k], []) for k in examples.keys()}
    # 결합된 텍스트들에 대한 길이 구함
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // 512) * 512
    result = {k: [t[i:i+512] for i in range(0, total_length, 512)] for k, t in concatenated_examples.items()}
    result['labels'] = result['input_ids'].copy()
    return result

In [26]:
lm_datasets = tokenized_dataset.map(group_texts, batched=True)

  0%|          | 0/51 [00:00<?, ?ba/s]

In [9]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 56081
    })
})

# whole word masking

In [10]:
wwm_probability = 0.2

def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # 단어와 해당 토큰 인덱스 간의 map 생성
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # 무작위로 단어 마스킹
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return default_data_collator(features)

In [11]:
downsampled_dataset = lm_datasets["train"].train_test_split(test_size=0.1, seed=42)

Loading cached split indices for dataset at /home/piai/.cache/huggingface/datasets/csv/default-d2549ac2fb5ac5b1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-e52ed23127291957.arrow and /home/piai/.cache/huggingface/datasets/csv/default-d2549ac2fb5ac5b1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-a30e36a728b6dccb.arrow


In [12]:
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 50472
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 5609
    })
})

In [13]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
batch_size = 4
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = 'kobert-base-v1'

training_args = TrainingArguments(
    output_dir=f"./models/{model_name}-finetuned-wholemasking",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_total_limit = 3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
    remove_unused_columns=False
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=whole_word_masking_data_collator,
)

/home/piai/hustar/Hustar_Group_4_TeamP/testMH/./models/kobert-base-v1-finetuned-wholemasking is already a clone of https://huggingface.co/minhub7/kobert-base-v1-finetuned-wholemasking. Make sure you pull the latest changes with `repo.git_pull()`.
Using amp half precision backend


In [16]:
trainer.train()

***** Running training *****
  Num examples = 50472
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18927
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mminhub7[0m. Use [1m`wandb login --relogin`[0m to force relogin


../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [24,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [24,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [24,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [24,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [24,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [24,0,0], thread: [69,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [24,0,0], thread: 

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/piai/anaconda3/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/piai/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/piai/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 1350, in forward
    outputs = self.bert(
  File "/home/piai/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/piai/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 1017, in forward
    encoder_outputs = self.encoder(
  File "/home/piai/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/piai/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 606, in forward
    layer_outputs = layer_module(
  File "/home/piai/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/piai/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 493, in forward
    self_attention_outputs = self.attention(
  File "/home/piai/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/piai/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 423, in forward
    self_outputs = self.self(
  File "/home/piai/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/piai/anaconda3/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 289, in forward
    mixed_query_layer = self.query(hidden_states)
  File "/home/piai/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/piai/anaconda3/lib/python3.9/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`


In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2780
  Batch size = 4


Epoch,Training Loss,Validation Loss
1,0.4215,0.382507
2,0.3836,0.360172
2,0.3836,0.353954


{'eval_loss': 0.3539535105228424}

In [None]:
trainer.push_to_hub()