# 자연어 처리 감정분석 전이학습

- 실행 전 런타임을 GPU로 연결하기 (학습 설정에서 `fp16=True`를 사용하므로 GPU 필요)

In [None]:
!pip install -qq torch transformers datasets numpy evaluate pandas

In [None]:
!pip install -qq accelerate -U

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
import pandas as pd
import numpy as np
import evaluate

In [None]:
# Load the Korean sentiment dataset by its Hugging Face Hub identifier.
dataset = load_dataset("sepidmnorozy/Korean_sentiment")
# Bare expression: the notebook renders the loaded object (later cells read
# its 'train' and 'validation' splits).
dataset

In [None]:
# Peek at two arbitrary training rows to see what a record looks like.
for sample_idx in (3118, 14310):
    print(dataset['train'][sample_idx])

## 토큰화 Tokenize

https://huggingface.co/kykim/bert-kor-base

In [None]:
# Pretrained checkpoint id; reused below for both the tokenizer and the
# sequence-classification model.
model_name = "kykim/bert-kor-base"

In [None]:
# Load the tokenizer that belongs to the `model_name` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bare expression: show the tokenizer object in the notebook output.
tokenizer

In [None]:
def tokenizer_func(x):
    """Tokenize the 'text' field of a batch of dataset rows.

    Every sequence is padded or truncated to a fixed length of 256 tokens.

    Args:
        x: Mapping with a 'text' entry (a batch of rows when called through
            ``dataset.map(..., batched=True)``).

    Returns:
        The output of the module-level ``tokenizer`` for that text.
    """
    encode_options = {
        "padding": "max_length",
        "max_length": 256,
        "truncation": True,
    }
    return tokenizer(x['text'], **encode_options)

In [None]:
# Apply `tokenizer_func` to the dataset; batched=True hands it batches of
# rows per call instead of single rows.
tokenized_datasets = dataset.map(tokenizer_func, batched=True)

In [None]:
# Upper bound on the number of training examples used for fine-tuning.
train_num_samples = 10000

# Shuffle with a fixed seed so the subset is reproducible, then keep at most
# `train_num_samples` rows. Clamping to the split size avoids an out-of-range
# index error from `select` when the split holds fewer rows than requested.
shuffled_train = tokenized_datasets['train'].shuffle(seed=42)
train_ds = shuffled_train.select(
    range(min(train_num_samples, len(shuffled_train)))
)
# The whole validation split (shuffled with the same seed) is used for evaluation.
eval_ds = tokenized_datasets['validation'].shuffle(seed=42)

## 전이학습 Transfer Learning

In [None]:
# Load the pretrained checkpoint as a sequence-classification model with two
# output labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

### Hyperparameters

In [None]:
# Fine-tuning hyperparameters read by the TrainingArguments cell:
# per-device batch size, number of training epochs, learning rate.
bs, epochs, lr = 32, 4, 1e-5

https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments

In [None]:
import inspect

# The evaluation-schedule argument is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. The install
# cell does not pin a version, so use whichever name the installed
# TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments).parameters
eval_strategy_arg = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names
    else 'evaluation_strategy'
)

args = TrainingArguments(
    'outputs',  # output directory (first positional argument)
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    # Accumulate 4 steps per optimizer update: effective per-device train
    # batch is bs * 4 (128 when bs=32).
    gradient_accumulation_steps=4,
    eval_accumulation_steps=4,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
    # Run evaluation once per epoch.
    **{eval_strategy_arg: 'epoch'},
)

### Metrics

In [None]:
# Accuracy metric from the `evaluate` library, used by `compute_metrics`.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Two-item iterable unpacked as (logits, labels).

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    scores, references = eval_pred
    # The model emits raw logits; the predicted class is the index of the
    # largest value along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

### Trainer

In [None]:
import inspect

# Newer transformers releases receive the tokenizer through
# `processing_class`; older ones through `tokenizer`. The install cell does
# not pin a version, so use the keyword the installed Trainer accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_arg = (
    'processing_class' if 'processing_class' in _trainer_arg_names
    else 'tokenizer'
)

# Wire the model, training configuration, datasets and metric together.
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    **{tokenizer_arg: tokenizer},
)

In [None]:
# Run fine-tuning with the configuration held by `trainer`.
trainer.train()

In [None]:
# Save the trained model to ./mymodel; the inference cells reload it from
# this directory.
trainer.save_model("./mymodel")

## 추론 Inference

In [None]:
# Build a text-classification pipeline from the directory written by
# `trainer.save_model`.
pipe = pipeline('text-classification', model="./mymodel")

### 테스트셋 사용 (검증 세트에서 100개 샘플링)

In [None]:
# Draw 100 rows for a quick sanity check of the pipeline.
# NOTE(review): the rows come from the 'validation' split, which is also the
# split passed to the Trainer as eval_dataset.
validation_shuffled = dataset['validation'].shuffle(seed=424)
test_data = validation_shuffled[:100]

# Tabular view of the sampled rows; displayed as the cell output.
td = pd.DataFrame(test_data)
td

In [None]:
# Classify every sampled text; the pipeline receives the whole list of
# strings in one call.
preds = pipe(td['text'].tolist())

# Tabulate the pipeline output (the next cell reads its 'label' column).
preds_df = pd.DataFrame(preds)
preds_df

In [None]:
# Turn the pipeline's string labels into integer class ids under a 'pred'
# column. 'label' is renamed first so it does not clash with the ground-truth
# 'label' column joined in below.
label_to_id = {'LABEL_1': 1, 'LABEL_0': 0}
preds_df = preds_df.rename(columns={'label': 'pred'})
preds_df['pred'] = preds_df['pred'].map(label_to_id)

# Put predictions side by side with the source rows (column-wise concat).
preds_df = pd.concat([preds_df, td], axis=1)
preds_df

In [None]:
# Boolean mask: True where the prediction equals the ground-truth label.
mask = preds_df['pred'] == preds_df['label']

# Number of correctly classified rows among the sampled ones.
int(mask.sum())

### 내 데이터셋

In [None]:
# Hand-written sentences with their expected label (presumably 1 = positive,
# 0 = negative — confirm against the dataset card), kept as (label, text)
# pairs and expanded into the record dicts used to build the DataFrame.
labeled_sentences = [
    (0, "절대로 강추할 수 없는 영화"),
    (0, "절대로 추천할 수 없는 영화"),
    (1, "또 보고 싶다."),
    (0, "이걸 보면서 웃을 수는 없다."),
    (0, "처음에는 재미있었는데 갈수록 산으로 가는 내용."),
    (1, "요즘 재미없는 영화만 나오는데 신선한 충격을 준 영화."),
    (1, "유명한 감독이나 배우가 나오지는 않지만 스토리가 감동"),
]
txts = [{'label': label, 'text': text} for label, text in labeled_sentences]

# Tabular view of the custom examples; displayed as the cell output.
txts_td = pd.DataFrame(txts)
txts_td

In [None]:
# Run the fine-tuned pipeline on the hand-written sentences.
preds_txts = pipe(txts_td['text'].tolist())

In [None]:
# Tabulate the pipeline output and convert its string labels to integer class
# ids under a 'pred' column, leaving the name 'label' free for the expected
# labels joined in below.
label_to_id = {'LABEL_1': 1, 'LABEL_0': 0}
preds_txts_df = pd.DataFrame(preds_txts).rename(columns={'label': 'pred'})
preds_txts_df['pred'] = preds_txts_df['pred'].map(label_to_id)

# Put predictions side by side with the hand-written examples.
preds_txts_df = pd.concat([preds_txts_df, txts_td], axis=1)
preds_txts_df

### 파이프라인 사용하지 않고 모델 로딩

In [None]:
# Tokenize the hand-written sentences as one padded batch of PyTorch tensors.
inputs = tokenizer(txts_td['text'].tolist(), padding=True, return_tensors="pt")
# Reload the saved model directly instead of going through the pipeline.
model_inf = AutoModelForSequenceClassification.from_pretrained("./mymodel")

# Gradients are not needed for inference, so disable autograd tracking.
with torch.no_grad():
    outputs = model_inf(**inputs)
logits = outputs.logits

In [None]:
# Index of the largest logit in each row, i.e. the predicted class id per
# sentence.
logits.argmax(axis=1)