In [1]:
import json
import os
from datasets import Dataset, DatasetDict
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Load Data for Fine-tuning

In [2]:
def load_and_combine_json_files(folder_path):
    """Read every ``*.json`` file in ``folder_path`` and merge their records.

    Files are visited in sorted filename order so the combined list is
    reproducible across runs and platforms (``os.listdir`` returns entries
    in arbitrary order).

    Args:
        folder_path: Directory that contains the JSON files.

    Returns:
        One list with the records of all files. A file whose top-level value
        is a list contributes each of its elements; any other top-level value
        (for example a single object) is added as one record.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    # Collect the JSON file names in a deterministic order.
    file_list = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

    # Accumulates the records of every file.
    all_data = []

    for file_name in file_list:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        if isinstance(json_data, list):
            all_data.extend(json_data)
        else:
            # extend() on a dict would silently add only its keys, so keep
            # a non-list payload as a single record instead.
            all_data.append(json_data)

    return all_data

In [3]:
# Label directories of the training and validation splits.
train_path, test_path = '../data/train/labels', '../data/validation/labels'

# Merge all JSON label files of each split into one list (train first, then test).
train_datas, test_datas = (
    load_and_combine_json_files(split_dir) for split_dir in (train_path, test_path)
)

## Data Processing

In [4]:
import pandas as pd

# Raw polarity value -> class id used as the training label.
POLARITY_TO_LABEL = {0: 0, 1: 1, -1: 2}


def _build_classification_dataset(records):
    """Turn raw label records into a ``Dataset`` with ``text`` / ``label`` columns.

    Rows without a ``GeneralPolarity`` value are dropped, the polarity is cast
    to int and mapped through ``POLARITY_TO_LABEL``, and ``RawText`` is exposed
    as ``text``.

    Args:
        records: Iterable of record dicts accepted by ``pd.DataFrame``; each
            record is expected to carry ``RawText`` and ``GeneralPolarity``.

    Returns:
        A ``datasets.Dataset`` holding only the ``text`` and ``label`` columns.

    Raises:
        ValueError: If a polarity value is not a key of ``POLARITY_TO_LABEL``.
            Without this check ``Series.map`` would silently produce NaN
            labels and turn the label column into floats.
    """
    frame = pd.DataFrame(records).dropna(subset=['GeneralPolarity'])

    polarity = frame['GeneralPolarity'].astype(int)
    labels = polarity.map(POLARITY_TO_LABEL)

    unmapped = labels.isna()
    if unmapped.any():
        unexpected = sorted(polarity[unmapped].unique().tolist())
        raise ValueError(f"Unexpected GeneralPolarity values: {unexpected}")

    frame = (
        frame.assign(GeneralPolarity=labels.astype(int))
        .rename(columns={'RawText': 'text', 'GeneralPolarity': 'label'})
        .reset_index(drop=True)
    )
    return Dataset.from_pandas(frame[['text', 'label']])


# Both splits go through exactly the same preparation.
train_dataset = _build_classification_dataset(train_datas)
test_dataset = _build_classification_dataset(test_datas)

In [5]:
# dataset = DatasetDict({
#     'train': train_dataset,
#     'test': test_dataset
# })


## Load HuggingFace Model

In [6]:
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

# One checkpoint name feeds both the classifier and its tokenizer.
checkpoint = "klue/bert-base"

# Sequence-classification model configured for 3 labels, plus the matching tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping that provides the raw strings under the ``"text"`` key.

    Returns:
        Whatever ``tokenizer`` returns when called with ``padding="max_length"``
        and ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

In [78]:
# Tokenize each split in batches, then shuffle it with a fixed seed.
train_dataset = train_dataset.map(tokenize_function, batched=True)
train_dataset = train_dataset.shuffle(seed=42)

test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.shuffle(seed=42)

Map: 100%|██████████| 3856/3856 [00:01<00:00, 2203.43 examples/s]
Map: 100%|██████████| 1974/1974 [00:00<00:00, 2972.89 examples/s]


## Training Settings

In [80]:
import numpy as np
import evaluate

# Load the "accuracy" metric through the `evaluate` library; compute_metrics
# calls `metric.compute(...)` on it.
metric = evaluate.load("accuracy")

In [81]:
def compute_metrics(eval_pred):
    """Score predictions with the notebook-level ``metric``.

    Args:
        eval_pred: Two-item sequence ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row
        of ``logits`` against ``labels``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted_classes, references=references)
    return result

In [82]:
from transformers import TrainingArguments, Trainer

# Training configuration: "test_trainer" is the output directory and the
# evaluation strategy is set to "epoch". No other argument is passed, so
# everything else keeps the library default.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [83]:
# Trainer places the model on the device selected by `training_args`, so the
# model is passed as-is. A hard-coded `.to('cuda:0')` would crash on machines
# without CUDA and adds nothing on machines that have it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # tokenized training split
    eval_dataset=test_dataset,     # tokenized validation split used for evaluation
    compute_metrics=compute_metrics,
)

In [84]:
# Run the training loop configured on `trainer`.
# NOTE(review): the output saved with this cell is "CUDA error: device-side
# assert triggered". Presumably a label falls outside the range the
# classification head expects; verify the label values before re-running.
trainer.train()

  0%|          | 0/1446 [00:00<?, ?it/s]

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Save Model

In [8]:
# Directories used by the save and reload cells for the tokenizer and the model.
token_path = 'first_test_train/tokenizer/'
model_path = 'first_test_train/model/'

In [62]:
# Save
tokenizer.save_pretrained(token_path)
model.save_pretrained(model_path)

('first_test_train/tokenizer/tokenizer_config.json',
 'first_test_train/tokenizer/special_tokens_map.json',
 'first_test_train/tokenizer/vocab.txt',
 'first_test_train/tokenizer/added_tokens.json',
 'first_test_train/tokenizer/tokenizer.json')

## Load Model & Test

In [9]:
#Load
tok = AutoTokenizer.from_pretrained(token_path)
mod = AutoModelForSequenceClassification.from_pretrained(model_path)

In [10]:
# 그냥 테스트용 훈련에 안쓴 데이터 아무거나 가져옴
path = r'..\147.속성기반_감정분석_데이터\01-1.정식개방데이터\Validation\02.라벨링데이터\VL_SNS_01.패션'
temp_data = load_and_combine_json_files(path)
df_test = pd.DataFrame(temp_data)
df_sample = df_test[df_test['GeneralPolarity'] == '-1']

In [11]:
from transformers import pipeline

# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise,
# so the sanity check also runs on machines without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("sentiment-analysis", model=mod, tokenizer=tok, device=pipeline_device)

# Classify one of the polarity -1 samples with the reloaded model.
res_text = df_sample['RawText'].iloc[10]
res = classifier(res_text)

print(res)

[{'label': 'LABEL_2', 'score': 0.9972212314605713}]
