In [1]:
!pip install transformers torch datasets transformers[torch]

from google.colab import drive
import pandas as pd
from datasets import Dataset, DatasetDict
import torch
from transformers import AutoTokenizer
import json
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetens

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Directory prefix for the data files; file names are appended by plain string
# concatenation, so the trailing slash is required.
file_path = '/content/drive/MyDrive/data/'

def json_to_df(file_name):
    """Load a labelled JSON Lines file into a DataFrame.

    Every non-blank line of the file is parsed as one JSON object. Its
    ``'input'`` value goes to the ``text`` column and its ``'output'`` value
    goes to the ``label`` column.

    Args:
        file_name: Name of the file; it is appended directly to the
            module-level ``file_path`` prefix.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``label``, one
        row per record in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'input'`` or ``'output'`` key.
    """
    text = []
    label = []

    with open(file_path + file_name, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (for example a trailing empty line), which
            # json.loads would reject.
            if not line.strip():
                continue
            record = json.loads(line)
            text.append(record['input'])
            label.append(record['output'])

    # Named `columns` instead of `dict` so the builtin is not shadowed.
    columns = {'text': text, 'label': label}
    return pd.DataFrame(columns)

def json_to_df_test(file_name):
    """Load an unlabelled JSON Lines file into a DataFrame.

    Every non-blank line of the file is parsed as one JSON object and only
    its ``'input'`` value is kept.

    Args:
        file_name: Name of the file; it is appended directly to the
            module-level ``file_path`` prefix.

    Returns:
        A ``pandas.DataFrame`` with the single column ``text``, one row per
        record in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'input'`` key.
    """
    text = []

    with open(file_path + file_name, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (for example a trailing empty line), which
            # json.loads would reject.
            if not line.strip():
                continue
            text.append(json.loads(line)['input'])

    # A dict literal is passed straight through; no local shadows `dict`.
    return pd.DataFrame({'text': text})

# Train and validation frames get 'text' and 'label' columns; the test frame
# is loaded with the 'text' column only.
train_df = json_to_df('data_train.jsonl')
test_df = json_to_df_test('data_test.jsonl')
val_df = json_to_df('data_val.jsonl')

Mounted at /content/drive


In [3]:
# Bundle the three pandas frames into one DatasetDict keyed by split name.
binary_ds = DatasetDict(
    {
        'train': Dataset.from_pandas(train_df),
        'val': Dataset.from_pandas(val_df),
        'test': Dataset.from_pandas(test_df),
    }
)
binary_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16580
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 2073
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2072
    })
})

In [4]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using {device} device...')

# Load the tokenizer of the chosen checkpoint, passing the separator and
# classification tokens explicitly.
model_ckpt = 'beomi/KcELECTRA-base-v2022'
tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt,
    sep_token='[SEP]',
    cls_token='[CLS]',
)

Using cuda device...


Downloading (…)okenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
# Sanity check: show how the first training sentence is split into tokens.
# Row-then-column indexing fetches a single example instead of reading the
# whole 'text' column just to take its first element.
tokenizer.tokenize(binary_ds['train'][0]['text'])

['보여주',
 '##면서',
 '왜',
 '엿보',
 '##냐고',
 '비난',
 '하는것도',
 '웃기지',
 '##만',
 '.',
 '훔쳐',
 '보면서',
 '왜',
 '보여주',
 '##냐고',
 '하는',
 '사람',
 '역시',
 '우습다',
 '.',
 '.']

In [6]:
def tokenize_and_encode(data):
    """Run the module-level ``tokenizer`` over the ``'text'`` field of ``data``.

    The tokenizer is asked for PyTorch tensors (``return_tensors='pt'``) with
    padding, truncation and special tokens enabled.

    Args:
        data: Mapping with a ``'text'`` entry to tokenize.

    Returns:
        Whatever the tokenizer call returns for that text.

    Note:
        After this function is mapped over the dataset, the example printed in
        this notebook shows every field wrapped in a one-element outer list.
    """
    encode_options = {
        'return_tensors': 'pt',
        'padding': True,
        'truncation': True,
        'add_special_tokens': True,
    }
    return tokenizer(data['text'], **encode_options)

# Collect the training split's column names and keep 'label' out of the
# removal list, so only the remaining original columns are dropped.
cols = binary_ds['train'].column_names
cols.remove('label')
# Tokenize the DatasetDict, dropping the columns listed in `cols`.
tokenized_ds = binary_ds.map(tokenize_and_encode, remove_columns = cols)
tokenized_ds

Map:   0%|          | 0/16580 [00:00<?, ? examples/s]

Map:   0%|          | 0/2073 [00:00<?, ? examples/s]

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16580
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2073
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2072
    })
})

In [7]:
# Inspect the first tokenized training example.
tokenized_ds['train'][0]

{'label': 1,
 'input_ids': [[2,
   11257,
   7997,
   2607,
   40904,
   9099,
   9372,
   17922,
   22658,
   4104,
   18,
   26327,
   11466,
   2607,
   11257,
   9099,
   7961,
   7944,
   8237,
   24956,
   18,
   18,
   3]],
 'token_type_ids': [[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]],
 'attention_mask': [[1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1]]}

In [8]:
def flat_input(data):
    """Flatten the nested ``input_ids`` of one example into a single list.

    Args:
        data: Mapping whose ``'input_ids'`` value is a list of lists.

    Returns:
        A dict with the single key ``'input_ids'`` holding the inner lists
        concatenated in order.
    """
    # A comprehension concatenates in one pass, whereas sum(lists, []) builds
    # a new list at every step. The dict literal also avoids depending on the
    # name `dict`.
    return {'input_ids': [token for row in data['input_ids'] for token in row]}
def flat_token(data):
    """Flatten the nested ``token_type_ids`` of one example into a single list.

    Args:
        data: Mapping whose ``'token_type_ids'`` value is a list of lists.

    Returns:
        A dict with the single key ``'token_type_ids'`` holding the inner
        lists concatenated in order.
    """
    # A comprehension concatenates in one pass, whereas sum(lists, []) builds
    # a new list at every step. The dict literal also avoids depending on the
    # name `dict`.
    return {'token_type_ids': [value for row in data['token_type_ids'] for value in row]}
def flat_attention(data):
    """Flatten the nested ``attention_mask`` of one example into a single list.

    Args:
        data: Mapping whose ``'attention_mask'`` value is a list of lists.

    Returns:
        A dict with the single key ``'attention_mask'`` holding the inner
        lists concatenated in order.
    """
    # A comprehension concatenates in one pass, whereas sum(lists, []) builds
    # a new list at every step. The dict literal also avoids depending on the
    # name `dict`.
    return {'attention_mask': [flag for row in data['attention_mask'] for flag in row]}

# Flatten one nested column per pass: each map drops the old column and the
# helper supplies the flattened column under the same name.
final_ds = (
    tokenized_ds
    .map(flat_input, remove_columns='input_ids')
    .map(flat_token, remove_columns='token_type_ids')
    .map(flat_attention, remove_columns='attention_mask')
)
final_ds

Map:   0%|          | 0/16580 [00:00<?, ? examples/s]

Map:   0%|          | 0/2073 [00:00<?, ? examples/s]

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

Map:   0%|          | 0/16580 [00:00<?, ? examples/s]

Map:   0%|          | 0/2073 [00:00<?, ? examples/s]

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

Map:   0%|          | 0/16580 [00:00<?, ? examples/s]

Map:   0%|          | 0/2073 [00:00<?, ? examples/s]

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16580
    })
    val: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2073
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2072
    })
})

In [9]:
# Persist the flattened DatasetDict to 'task_dataset' under the Drive data
# directory.
final_ds.save_to_disk(file_path+'task_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/16580 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2073 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2072 [00:00<?, ? examples/s]