In [1]:
!pip install transformers torch datasets transformers[torch]

from google.colab import drive
import pandas as pd
from datasets import Dataset, DatasetDict
import torch
from transformers import AutoTokenizer
import json
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers)
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Installing collected pack

In [2]:
# Folder prefix (with trailing slash) that every file name below is appended to.
file_path = '/content/drive/MyDrive/'

# Mount Google Drive into the Colab filesystem so paths under file_path resolve.
drive.mount('/content/drive')

def json_to_df(file_name):
    """Load a labelled JSON Lines file into a two-column DataFrame.

    Every non-blank line of ``file_path + file_name`` is parsed as one JSON
    object. Its ``'input'`` value goes to the ``text`` column and its
    ``'output'`` value goes to the ``label`` column.

    Args:
        file_name: Name of the JSONL file, appended to the module-level
            ``file_path`` prefix.

    Returns:
        pandas.DataFrame with columns ``text`` and ``label``, one row per
        record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'input'`` or ``'output'`` key.
    """
    texts = []
    labels = []

    with open(file_path + file_name, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (for example a trailing empty line) carry no record;
            # skip them rather than letting json.loads raise on them.
            if not line.strip():
                continue
            record = json.loads(line)
            texts.append(record['input'])
            labels.append(record['output'])

    return pd.DataFrame({'text': texts, 'label': labels})

def json_to_df_test(file_name):
    """Load a JSON Lines file into a text-only DataFrame.

    Every non-blank line of ``file_path + file_name`` is parsed as one JSON
    object and only its ``'input'`` value is kept; any other keys in the
    record are ignored.

    Args:
        file_name: Name of the JSONL file, appended to the module-level
            ``file_path`` prefix.

    Returns:
        pandas.DataFrame with a single ``text`` column, one row per record,
        in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'input'`` key.
    """
    texts = []

    with open(file_path + file_name, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (for example a trailing empty line) carry no record;
            # skip them rather than letting json.loads raise on them.
            if not line.strip():
                continue
            texts.append(json.loads(line)['input'])

    return pd.DataFrame({'text': texts})

# The train and dev files are read with their labels; the test file is read text-only.
train_df, val_df = map(json_to_df, ('data_train.jsonl', 'data_dev.jsonl'))
test_df = json_to_df_test('data_test.jsonl')

Mounted at /content/drive


In [3]:
# Wrap each pandas frame as a Dataset and gather the three splits under one DatasetDict.
split_frames = {'train': train_df, 'val': val_df, 'test': test_df}
binary_ds = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in split_frames.items()
})
binary_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16580
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 2073
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2072
    })
})

In [4]:
# Use the GPU when torch reports one as available; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Using {device} device...')

# Load the tokenizer that ships with the checkpoint, passing the separator and
# classifier special tokens explicitly.
model_ckpt = "beomi/kcbert-base"
special_tokens = {'sep_token': '[SEP]', 'cls_token': '[CLS]'}
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, **special_tokens)

Using cuda device...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

In [5]:
# Preview how the tokenizer splits the first training sentence.
# Index the row first and the column second: asking for the 'text' column first
# would load every training text just to read a single one.
tokenizer.tokenize(binary_ds['train'][0]['text'])

['보여주',
 '##면서',
 '왜',
 '엿',
 '##보냐',
 '##고',
 '비난',
 '하는것도',
 '웃기지',
 '##만',
 '.',
 '훔쳐',
 '보면서',
 '왜',
 '보여주',
 '##냐고',
 '하는',
 '사람',
 '역시',
 '우습다',
 '.',
 '.']

In [6]:
def tokenize_and_encode(data):
    """Run the module-level ``tokenizer`` on ``data['text']``.

    Args:
        data: Mapping with a ``'text'`` entry to encode.

    Returns:
        Whatever ``tokenizer`` returns for that text when called with
        PyTorch tensor output, padding, truncation and special tokens
        all switched on.
    """
    # NOTE(review): the encoded example displayed further down shows each
    # feature wrapped in an extra outer list, presumably the batch dimension
    # that return_tensors='pt' adds. Confirm before changing these options.
    encode_options = {
        'return_tensors': 'pt',
        'padding': True,
        'truncation': True,
        'add_special_tokens': True,
    }
    return tokenizer(data['text'], **encode_options)

# Remove every original column except 'label' after encoding, so only the label
# and the tokenizer outputs remain in each split.
cols = [name for name in binary_ds['train'].column_names if name != 'label']
tokenized_ds = binary_ds.map(tokenize_and_encode, remove_columns = cols)
tokenized_ds

Map:   0%|          | 0/16580 [00:00<?, ? examples/s]

Map:   0%|          | 0/2073 [00:00<?, ? examples/s]

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16580
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2073
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2072
    })
})

In [7]:
# Inspect the first encoded training example to check the stored structure.
first_encoded = tokenized_ds['train'][0]
first_encoded

{'label': 1,
 'input_ids': [[2,
   10869,
   8031,
   2332,
   2280,
   19032,
   4034,
   9230,
   17771,
   22006,
   4049,
   17,
   26763,
   11274,
   2332,
   10869,
   9147,
   7996,
   7978,
   8294,
   24644,
   17,
   17,
   3]],
 'token_type_ids': [[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]],
 'attention_mask': [[1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1]]}

In [8]:
def flat_input(data):
    """Collapse the nested ``input_ids`` of one example into a single flat list.

    Args:
        data: Mapping whose ``'input_ids'`` value is a sequence of sequences.

    Returns:
        dict with the single key ``'input_ids'`` holding the inner elements
        concatenated in their original order.
    """
    # One linear pass; repeated list addition would re-copy the accumulated
    # list at every step and only accepts list rows.
    return {'input_ids': [token_id for row in data['input_ids'] for token_id in row]}
def flat_token(data):
    """Collapse the nested ``token_type_ids`` of one example into a single flat list.

    Args:
        data: Mapping whose ``'token_type_ids'`` value is a sequence of sequences.

    Returns:
        dict with the single key ``'token_type_ids'`` holding the inner
        elements concatenated in their original order.
    """
    # One linear pass; repeated list addition would re-copy the accumulated
    # list at every step and only accepts list rows.
    return {'token_type_ids': [type_id for row in data['token_type_ids'] for type_id in row]}
def flat_attention(data):
    """Collapse the nested ``attention_mask`` of one example into a single flat list.

    Args:
        data: Mapping whose ``'attention_mask'`` value is a sequence of sequences.

    Returns:
        dict with the single key ``'attention_mask'`` holding the inner
        elements concatenated in their original order.
    """
    # One linear pass; repeated list addition would re-copy the accumulated
    # list at every step and only accepts list rows.
    return {'attention_mask': [flag for row in data['attention_mask'] for flag in row]}

# Flatten the three tokenizer outputs in turn; each pass drops the nested column
# and puts the flat version returned by the mapper in its place.
final_ds = (
    tokenized_ds
    .map(flat_input, remove_columns = 'input_ids')
    .map(flat_token, remove_columns = 'token_type_ids')
    .map(flat_attention, remove_columns = 'attention_mask')
)
final_ds

Map:   0%|          | 0/16580 [00:00<?, ? examples/s]

Map:   0%|          | 0/2073 [00:00<?, ? examples/s]

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

Map:   0%|          | 0/16580 [00:00<?, ? examples/s]

Map:   0%|          | 0/2073 [00:00<?, ? examples/s]

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

Map:   0%|          | 0/16580 [00:00<?, ? examples/s]

Map:   0%|          | 0/2073 [00:00<?, ? examples/s]

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16580
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2073
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2072
    })
})

In [9]:
# Persist the flattened DatasetDict under the Drive prefix so it can be reloaded later.
dataset_dir = file_path + 'kcbert_task_dataset'
final_ds.save_to_disk(dataset_dir)

Saving the dataset (0/1 shards):   0%|          | 0/16580 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2073 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2072 [00:00<?, ? examples/s]

In [9]:
# Export the raw test texts next to the dataset, without the index column.
# 'utf-8-sig' writes a BOM, presumably so spreadsheet tools read the Korean text correctly.
testset_csv = file_path + 'kcbert_task_testset.csv'
test_df.to_csv(testset_csv, index = False, encoding = 'utf-8-sig')