In [1]:
import os
from datasets import load_dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer
import evaluate
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# dataset_name = 'beomi/KoAlpaca-v1.1a'
dataset_name = 'nlpai-lab/kullm-v2'
# dataset_name = 'nlpai-lab/kullm-v2'
# dataset_name = 'junelee/sharegpt_deepl_ko'

model_name = 'EleutherAI/polyglot-ko-1.3b'
# model_name = 'gpt2'

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token':'<|endoftext|>'})

dataset = load_dataset(dataset_name)
dataset = dataset.filter(lambda example: len(example['instruction'])>0, num_proc=24) # filtering under 0 token length

# filtering over 1024 token length
def encode_preprocess(examples):
    return tokenizer(examples['instruction'])#, padding=True, return_tensors='pt')

# truncate max length, add padding true
def encode_pad_preprocess(examples):
    return tokenizer(examples['instruction'], max_length=512, truncation=True, padding=True, return_tensors='pt')

# extract truncated sentence max_length 1024
def decode_process(examples):
    return {'trunc_instruction': tokenizer.decode(examples['input_ids'], skip_special_tokens=True)}

trunc_data = dataset.map(encode_preprocess, batched=True, num_proc=24)
trunc_data = trunc_data.filter(lambda example: len(example['input_ids'])<512, num_proc=24)
encode_pad_data = trunc_data.map(encode_pad_preprocess, batched=True, num_proc=24)
preprocessed_data = encode_pad_data.map(decode_process, num_proc=96)
print(preprocessed_data)

  from .autonotebook import tqdm as notebook_tqdm


[2023-09-08 02:55:53,200] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
DatasetDict({
    train: Dataset({
        features: ['id', 'instruction', 'input', 'output', 'input_ids', 'token_type_ids', 'attention_mask', 'trunc_instruction'],
        num_rows: 145175
    })
})


In [2]:
pd.DataFrame(preprocessed_data['train']['instruction'])[0].str.len().describe()

count    145175.000000
mean         63.464481
std         111.533839
min           1.000000
25%          23.000000
50%          32.000000
75%          52.000000
max        1245.000000
Name: 0, dtype: float64

In [4]:
perplexity = evaluate.load("perplexity", module_type="metric")
instruction = preprocessed_data['train']['trunc_instruction']
instruction

len_instruction = [len(text) for text in instruction]
ppl_results = perplexity.compute(model_id=model_name, add_start_token=False, predictions=instruction)
round_ppl_results = [round(ppl, 2) for ppl in  ppl_results['perplexities']]

Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.92s/it]


AssertionError: When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings.

: 

In [None]:
dict_ppl_instruction = dict(zip(instruction, round_ppl_results))
df_dataset = pd.DataFrame(sorted(dict_ppl_instruction.items(), key=lambda x: x[0]), columns=['instruction', 'ppl'])
df_dataset

Unnamed: 0,instruction,ppl
0,"""(주)회사명과 회사명(주)의 차이점은 무엇인가요?""\n\n 질문 본문: ""주식회사...",16.44
1,"""-에요""와 ""-예요""의 쓰임에 대해서 자세히 설명해주세요.",23.21
2,"""..등이 있다."", ""..등등이 있다.""할때 등과 등등은 무슨 차이가 있나요? 의...",17.52
3,"""02:42:35 OST"" 이런 식으로 시간 뒤에 붙는 OST는 무슨 뜻인가요?",99.94
4,"""1+1=0"" 이라는 문구를 어디서 본 것 같은데, 어디서 봤는지 전혀 기억이 나지...",10.19
...,...,...
21145,"힙합 음악에서 자주 사용되는 용어인 플로우, 펀치라인, 그리고 라인의 의미가 무엇인...",16.16
21146,"힙합 패션을 입을 때 세미힙합, 무난한 캐쥬얼, 리얼힙합을 고민하고 있습니다. 어떤...",29.49
21147,"힙합(랩)에 대해 기본적인 지식과 용어, 그리고 영향력 있는 곡들을 알고 싶습니다.",14.79
21148,힙합에서 자주 쓰이는 용어 'AKA'는 무슨 뜻인가요?,17.68


In [None]:
datasetname = dataset_name.split('/')[-1]
df_dataset.to_json(f"{datasetname}_ppl.json", orient='records', lines=True, force_ascii=False)

In [None]:
from datasets import Dataset, DatasetDict

dataset = preprocessed_data['train'].add_column("ppl", round_ppl_results)
dataset = dataset.add_column("len", len_instruction)
dataset = dataset.remove_columns(['input_ids', 'attention_mask', 'trunc_instruction',])
dataset

Dataset({
    features: ['instruction', 'output', 'url', 'token_type_ids', 'ppl', 'len'],
    num_rows: 21155
})

In [None]:
datasetname = dataset_name.split('/')[-1]
model_name = model_name.split('/')[-1]
dataset.push_to_hub(f'nayohan/{datasetname}_ppl_{model_name}')

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
#pd.read_json('/home/uj-user/Yo/HiT5/HCLT/nlpai-lab_kullm-v2_ppl_polyglot.json', orient='records', lines=True)