## 설치

In [8]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
!pip install wandb -qU

In [11]:
# Download the minjaechoi99/aihub-groom dataset archive with the Kaggle CLI.
# An up-to-date local copy is skipped unless --force is passed.
!kaggle datasets download -d minjaechoi99/aihub-groom

aihub-groom.zip: Skipping, found more recently modified local copy (use --force to force download)


In [12]:
!unzip /content/aihub-groom.zip

Archive:  /content/aihub-groom.zip
replace data/Training/talksets-train-1.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: data/Training/talksets-train-1.json  
  inflating: data/Training/talksets-train-2.json  
  inflating: data/Training/talksets-train-3.json  
  inflating: data/Training/talksets-train-4.json  
  inflating: data/Training/talksets-train-5.json  
  inflating: data/Validation/talksets-train-6.json  


In [13]:
import torch
import torch.optim as optim

import numpy as np
import tqdm as tqdm

from transformers import(
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    default_data_collator,
    EarlyStoppingCallback,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_constant_schedule,
    AdamW
)

In [14]:
import wandb
# Authenticate with Weights & Biases before any sweep or run is created.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmjchoi[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Read JSON

In [15]:
from pathlib import Path
import json, re
def read_aihub(path):
    """Load one AI Hub talkset JSON file as binary-labelled sentences.

    The file must contain a list of groups; each group has a ``'sentences'``
    list whose items provide ``'origin_text'`` and a numeric ``'intensity'``.

    Processing rules:
      * every ``#...#`` span in the text is replaced by ``[UNK]``
        (presumably anonymised placeholders - confirm against the dataset);
      * sentences whose intensity is exactly 1 are dropped;
      * the remaining sentences are labelled 0 when intensity < 1, else 1.

    Args:
        path: location of the JSON file (``str`` or ``pathlib.Path``).

    Returns:
        A ``(texts, intensities)`` tuple of equally long lists: the cleaned
        sentence strings and their 0/1 labels.
    """
    path = Path(path)
    with open(path, 'rb') as f:
        groups = json.load(f)

    texts = []
    intensities = []
    for group in groups:
        for passage in group['sentences']:
            text = passage['origin_text']
            intensity = passage['intensity']
            if intensity == 1:
                # Exactly-1 samples are excluded from both classes.
                continue
            texts.append(re.sub('#.*?#', '[UNK]', text))
            intensities.append(0 if intensity < 1 else 1)

    return texts, intensities

In [16]:
import pandas as pd

# Accumulate the five training shards into parallel text / label lists.
texts, intensities = [], []
for i in range(1, 6):
    file_path = f'/content/data/Training/talksets-train-{i}.json'
    ttexts, iintensities = read_aihub(file_path)
    texts.extend(ttexts)
    intensities.extend(iintensities)
ai_df = pd.DataFrame({'texts': texts, 'intensities': intensities})

# The single file under Validation/ becomes the test frame.
file_path = '/content/data/Validation/talksets-train-6.json'
texts, intensities = read_aihub(file_path)
test_df = pd.DataFrame({'texts': texts, 'intensities': intensities})

<_io.BufferedReader name='/content/data/Training/talksets-train-1.json'>
<_io.BufferedReader name='/content/data/Training/talksets-train-2.json'>
<_io.BufferedReader name='/content/data/Training/talksets-train-3.json'>
<_io.BufferedReader name='/content/data/Training/talksets-train-4.json'>
<_io.BufferedReader name='/content/data/Training/talksets-train-5.json'>
<_io.BufferedReader name='/content/data/Validation/talksets-train-6.json'>


In [17]:
# Preview the combined training frame (columns: texts, intensities).
ai_df

Unnamed: 0,texts,intensities
0,부랴부랴 왔는데 아무도 안왔네. 시간개념들이 없네,1
1,맞아. 사람들이 진짜 개념이없다,1
2,저렇게는 살지 말아야지,1
3,그러게 게으르고 멍청한 사람들은 맞아야해,1
4,특히 벗방보는 애들은 진짜 거세 시켜야함,1
...,...,...
296066,뷔응신처럼 죽지도 못하면서 괜히 뛰어들기는,1
296067,오죽 하면 그러겠나,0
296068,너 얼굴 보면 한숨 나온다.,1
296069,신경 끄세요.,0


In [18]:
# Shuffle every row with a fixed seed so the train/dev split is reproducible.
# NOTE: this rebinds ai_df, so running the cell twice shuffles the already
# shuffled frame and yields a different order than a single run.
ai_df = ai_df.sample(frac=1,random_state=43)

In [19]:
# Row position that separates the first 80% (train) from the last 20% (dev).
spliter = int(0.8 * len(ai_df))

In [20]:
# Positional split of the shuffled frame: rows before `spliter` train, the rest validate.
train_df, dev_df = ai_df.iloc[:spliter], ai_df.iloc[spliter:]

In [21]:
# Preview the dev split.
dev_df

Unnamed: 0,texts,intensities
42102,방탕출 게임 하러 왔는데 못 나가겠어.. 갇힌 것 같아,0
77099,음식은 개같이 만들어서 대리점주 전부 망하게 해놓고 지만 잘 먹고 잘사는 샛기.,1
34793,본인 등판했네 ㅉㅉ 걍 발 닦고 잠이나 자라,0
13545,니 얼굴을 보면 알지 딱바도 너 못생겼잖아,1
111097,얘 춤선 장난 아님 일단 보고 까라,0
...,...,...
57651,윗사람들도 다 똥만찼지 뭐.,1
40753,저러니까 지나가는 사람들한테도 손가락질을 당하지.,1
248063,돼지는 뭘 해도 돼지일 뿐이야,1
150848,이 늦은 시간에 자꾸 연락하는 개념 가출한 놈이 있다?,1


In [22]:
# Preview the train split.
train_df

Unnamed: 0,texts,intensities
30498,그럼 [UNK] 목소리는 어때?,0
184719,[UNK] 목소리 정말 좋은 거 같아,0
286333,아비 어미 제대로 없는 애들은 가까이 하게 하면 안 돼,1
160138,야 어제 축제에 옆학교 댄스팀 왔다며?,0
284918,아니 왜?,0
...,...,...
95095,뭐야 왜이러세요. 신고합니다.,0
73388,기왕 넘어오는 거 침대까지 와서 자빠져주면 좋겠다.,1
279721,ㅋㅋ뺳국 개네 노래 후보정이면서 잘하는 척 감미로운척할 게 진짜 지관통으로 후리고 싶음,1
108739,저 연예인은 기부를 매번 정말 많이 하네.,0


In [24]:
from datasets import Dataset

# Wrap each pandas split in a `datasets.Dataset` so it can be tokenized with `.map`.
train_data = Dataset.from_pandas(train_df)
dev_data = Dataset.from_pandas(dev_df)
test_data = Dataset.from_pandas(test_df)
# Show the dev dataset summary as the cell output.
dev_data

Dataset({
    features: ['texts', 'intensities', '__index_level_0__'],
    num_rows: 59215
})

In [25]:
# Show the train dataset summary (features and row count).
train_data

Dataset({
    features: ['texts', 'intensities', '__index_level_0__'],
    num_rows: 236856
})

## tokenize

In [29]:
# Hugging Face Hub checkpoint used for both the tokenizer and the classifier.
model_name = 'beomi/KcELECTRA-base-v2022'
# Token length every example is padded / truncated to by the tokenize helpers.
max_length = 128

In [30]:
# Load the tokenizer that matches the `model_name` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [26]:
def tokenizeWithLabel(data):
    """Tokenize a batch of examples and attach their labels.

    Args:
        data: mapping with a ``'texts'`` entry (the sentences to encode) and
            an ``'intensities'`` entry (their labels), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output, padded / truncated to ``max_length`` tokens,
        with the labels stored under ``'labels'``.
    """
    tokenized_datas = tokenizer(
        data['texts'],
        max_length=max_length,
        padding="max_length",
        # Only one sequence is passed per example, so the pair-only strategy
        # "only_second" has nothing to truncate; True truncates the text itself.
        truncation=True
    )
    tokenized_datas['labels'] = data['intensities']
    return tokenized_datas

In [27]:
def tokenizeWithoutLabel(data):
    """Tokenize a batch of examples without attaching labels.

    Args:
        data: mapping with a ``'texts'`` entry holding the sentences to
            encode, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output, padded / truncated to ``max_length`` tokens.
    """
    tokenized_datas = tokenizer(
        data['texts'],
        max_length=max_length,
        padding="max_length",
        # Only one sequence is passed per example, so the pair-only strategy
        # "only_second" has nothing to truncate; True truncates the text itself.
        truncation=True
    )
    return tokenized_datas

In [32]:
# Tokenize every split in batches; the original columns are removed so only
# the tokenizer outputs (plus 'labels' for train/dev) remain.
train_tokenized_datasets = train_data.map(tokenizeWithLabel, batched=True, remove_columns=train_data.column_names)
dev_tokenized_datasets = dev_data.map(tokenizeWithLabel, batched=True, remove_columns=dev_data.column_names)
# NOTE(review): the test split is tokenized WITHOUT labels even though test_df
# has an 'intensities' column - confirm this is intended (inference only).
test_tokenized_datasets = test_data.map(tokenizeWithoutLabel, batched=True, remove_columns=test_data.column_names)

  0%|          | 0/237 [00:00<?, ?ba/s]

  0%|          | 0/60 [00:00<?, ?ba/s]

  0%|          | 0/38 [00:00<?, ?ba/s]

In [33]:
# Show the tokenized train dataset summary (features and row count).
train_tokenized_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 236856
})

## train with sweep

In [42]:
# W&B sweep definition: a grid search over a single hyper-parameter
# combination, scored by the evaluation loss (lower is better).
sweep_configuration = dict(
    method='grid',
    name='sweep',
    metric=dict(goal='minimize', name='eval/loss'),
    parameters=dict(
        batch_size=dict(values=[256]),
        epochs=dict(values=[10]),
        lr=dict(values=[5e-5]),
    ),
)

In [43]:
# Largest per-device batch; larger sweep batch sizes are reached through
# gradient accumulation inside train().
max_batch_size = 256
def train():
    """Run one W&B sweep trial: fine-tune ``model_name`` and upload the result.

    Reads ``batch_size``, ``epochs`` and ``lr`` from ``wandb.config``, trains a
    sequence-classification model on ``train_tokenized_datasets`` while
    evaluating on ``dev_tokenized_datasets``, saves the final model to the
    ``pytorch_finetuned`` directory and logs that directory as a W&B model
    artifact. Takes no arguments and returns nothing, as required for a
    ``wandb.agent`` target function.
    """
    torch.cuda.empty_cache()
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    run = wandb.init(config=sweep_configuration, entity="groom2team")

    # Cap the per-device batch and make up the difference by accumulating
    # gradients. The floor of 1 matters when the sweep batch size is smaller
    # than max_batch_size: the integer division would otherwise give 0.
    batch_size = min(wandb.config.batch_size, max_batch_size)
    gradient_accumulation_steps = max(1, wandb.config.batch_size // max_batch_size)
    epochs = wandb.config.epochs
    # Approximate optimizer-step count; it drives the scheduler length and the
    # logging / eval / save intervals. Trainer computes its own total, which
    # may differ by a few steps because it rounds per epoch.
    total_steps = int(len(train_tokenized_datasets)/wandb.config.batch_size*epochs)
    learning_rate = wandb.config.lr
    data_collator = default_data_collator

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # Linear decay without warm-up. get_cosine_schedule_with_warmup and
    # get_constant_schedule are imported as alternatives.
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps+1,
    )
    optimizers = optimizer, scheduler

    args = TrainingArguments(
        f"{model_name}-finetuned",
        evaluation_strategy = "steps",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        gradient_accumulation_steps = gradient_accumulation_steps,
        report_to="wandb",
        run_name="utopia",
        # Intervals are floored at 1 so a small dataset cannot produce a
        # zero-step interval.
        logging_steps = max(1, total_steps//200),
        eval_steps = max(1, total_steps//100),
        save_steps = max(1, total_steps//100),
        weight_decay=0.0,
        save_total_limit = 2,
        load_best_model_at_end=True
    )
    trainer = Trainer(
        model,
        args,
        train_dataset=train_tokenized_datasets,
        eval_dataset=dev_tokenized_datasets,
        data_collator=data_collator,
        tokenizer=tokenizer,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=10)],
        optimizers=optimizers
    )
    trainer.train()  # run the fine-tuning loop
    # Save the model held by the trainer (load_best_model_at_end=True is set,
    # so this is intended to be the best checkpoint).
    trainer.save_model(output_dir= 'pytorch_finetuned')
    # Version the saved model directory as a W&B artifact.
    artifact = wandb.Artifact(name='pytorch_finetuned', type='model')
    artifact.add_dir('pytorch_finetuned', name='best_model_at_end')
    run.log_artifact(artifact)

In [44]:
# Register the sweep under the groom2team/aihub_pj3 project and keep its id for the agent.
sweep_id = wandb.sweep(sweep=sweep_configuration, project='aihub_pj3', entity='groom2team')
# Number of runs the agent is allowed to execute.
count = 1

Create sweep with ID: 0pgdl1hs
Sweep URL: https://wandb.ai/groom2team/aihub_pj3/sweeps/0pgdl1hs


In [None]:
# Start a sweep agent that calls train() for up to `count` runs.
wandb.agent(sweep_id, function=train, count=count)

[34m[1mwandb[0m: Agent Starting Run: gj2zfyyp with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	lr: 5e-05
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--beomi--KcELECTRA-base-v2022/snapshots/4431b6c7ad00f82fd50880864574cef97e0a368b/config.json
Model config ElectraConfig {
  "_name_or_path": "beomi/KcELECTRA-base-v2022",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
 

PyTorch: setting up devices
***** Running training *****
  Num examples = 236856
  Num Epochs = 10
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 9260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
92,0.3196,0.298174
184,0.2864,0.285126


***** Running Evaluation *****
  Num examples = 59215
  Batch size = 256
Saving model checkpoint to beomi/KcELECTRA-base-v2022-finetuned/checkpoint-92
Configuration saved in beomi/KcELECTRA-base-v2022-finetuned/checkpoint-92/config.json
Model weights saved in beomi/KcELECTRA-base-v2022-finetuned/checkpoint-92/pytorch_model.bin
tokenizer config file saved in beomi/KcELECTRA-base-v2022-finetuned/checkpoint-92/tokenizer_config.json
Special tokens file saved in beomi/KcELECTRA-base-v2022-finetuned/checkpoint-92/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 59215
  Batch size = 256
Saving model checkpoint to beomi/KcELECTRA-base-v2022-finetuned/checkpoint-184
Configuration saved in beomi/KcELECTRA-base-v2022-finetuned/checkpoint-184/config.json
Model weights saved in beomi/KcELECTRA-base-v2022-finetuned/checkpoint-184/pytorch_model.bin
tokenizer config file saved in beomi/KcELECTRA-base-v2022-finetuned/checkpoint-184/tokenizer_config.json
Special tokens file saved 

In [None]:
wandb.finish() # end the wandb run

AttributeError: ignored