# 쇼핑몰 리뷰 평점 분류 경진대회

In [1]:

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tqdm
import pandas as pd
import torch
import datasets
import glob

from glob import glob
from tqdm import tqdm

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

## 1. 모델 설정

In [2]:
# Checkpoint to fine-tune. Alternatives noted by the author:
# "kykim/bert-kor-base", "kykim/electra-kor-base", "klue/roberta-large",
# "klue/bert-large", "klue/roberta-base".
MODEL_NAME = "klue/bert-base"
batch_size = 200       # batch size shared by every DataLoader in this notebook
learning_rate = 2e-5   # initial learning rate handed to the optimizer
NUM_EPOCH = 5          # number of full passes over the training split

# Seed torch's global RNG so that the randomly initialised classification head
# and the DataLoader shuffling order are repeatable after Restart & Run All.
# The value mirrors the random_state used for the train/valid split.
SEED = 2022
torch.manual_seed(SEED);

## 2. 데이터셋 로드

In [3]:
from datasets import load_dataset, Dataset

# The training CSV is read into a pandas DataFrame (it is split with
# train_test_split later on); the test CSV is loaded through Hugging Face
# `load_dataset`, which returns a DatasetDict whose rows live under the
# 'train' key.
raw_train = pd.read_csv('./dataset/train.csv')
raw_test = load_dataset('csv', data_files='./dataset/test.csv')

Using custom data configuration default-24f77f946c60edab
Reusing dataset csv (/home/uj-user/.cache/huggingface/datasets/csv/default-24f77f946c60edab/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Stratified split on the 'target' column with a fixed random_state.
# NOTE(review): test_size=0.001 leaves only 25 validation rows, so validation
# accuracy can only move in 4-point steps and is unlikely to separate
# checkpoints. Consider a larger hold-out if validation is meant to guide
# model selection.
train, valid = train_test_split(raw_train, random_state=2022, stratify=raw_train['target'], test_size=0.001)
train

Unnamed: 0,id,reviews,target
13412,13412,잠자리가 추워서 샀는데 이거때매 회사 지각 몇번했습니다,5
15013,15013,티비서하두멘보샤멘보샤해서궁금해서먹어봤습니다.첨먹엇을땐맛있었는데느끼한맛이강합니다,2
13297,13297,정말튼튼하고 나무재질도 좋습니다 다리부분도 튼튼해요 의자도 너무좋네요 배송은주문제작...,5
18720,18720,배송시까지 일주일걸립니다 당일도착 3일간 오고 결국 일주일뒤에 도착했습니다.통화해보...,2
20535,20535,손때및 기스가 잘나요!,2
...,...,...,...
11067,11067,이게 바지락인지 재첩인지..ㅠ 백원짜리만하네요...여태 여러군데서 사봤는데 가장 작...,2
4319,4319,xl 사려다 품절이여서 프리사이즈 주문했는데 많이 후회되네요. (사이즈 후회) 1....,2
12279,12279,고양이집 만드는데 썻습니다,5
7486,7486,많이 비치치 않고 길이감도 적당합니다.,5


In [5]:
# Preview only the two columns the model consumes: review text and rating.
train[['reviews', 'target']]

Unnamed: 0,reviews,target
13412,잠자리가 추워서 샀는데 이거때매 회사 지각 몇번했습니다,5
15013,티비서하두멘보샤멘보샤해서궁금해서먹어봤습니다.첨먹엇을땐맛있었는데느끼한맛이강합니다,2
13297,정말튼튼하고 나무재질도 좋습니다 다리부분도 튼튼해요 의자도 너무좋네요 배송은주문제작...,5
18720,배송시까지 일주일걸립니다 당일도착 3일간 오고 결국 일주일뒤에 도착했습니다.통화해보...,2
20535,손때및 기스가 잘나요!,2
...,...,...
11067,이게 바지락인지 재첩인지..ㅠ 백원짜리만하네요...여태 여러군데서 사봤는데 가장 작...,2
4319,xl 사려다 품절이여서 프리사이즈 주문했는데 많이 후회되네요. (사이즈 후회) 1....,2
12279,고양이집 만드는데 썻습니다,5
7486,많이 비치치 않고 길이감도 적당합니다.,5


In [6]:

# Convert both pandas splits into Hugging Face Datasets, keeping only the text
# and label columns and dropping the (shuffled) pandas index.
raw_train_dataset, raw_valid_dataset = (
    Dataset.from_pandas(frame[['reviews', 'target']], preserve_index=False)
    for frame in (train, valid)
)

In [7]:
# Display the converted training split (feature names and row count).
raw_train_dataset

Dataset({
    features: ['reviews', 'target'],
    num_rows: 24975
})

In [8]:
# Gather the three splits under one DatasetDict so that tokenization can be
# applied to all of them in a single map() call. The test rows sit under the
# 'train' key of the object returned by load_dataset.
splits = {
    'train': raw_train_dataset,
    'valid': raw_valid_dataset,
    'test': raw_test['train'],
}
review_dataset = datasets.DatasetDict(splits)
review_dataset

DatasetDict({
    train: Dataset({
        features: ['reviews', 'target'],
        num_rows: 24975
    })
    valid: Dataset({
        features: ['reviews', 'target'],
        num_rows: 25
    })
    test: Dataset({
        features: ['id', 'reviews'],
        num_rows: 25000
    })
})

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sanity-check the tokenizer on the first review of the unsplit frame.
# A label-based lookup such as train['reviews'][0] raises KeyError whenever
# row 0 happens to fall into the validation split, so index positionally on
# the full frame instead.
print(tokenizer.tokenize(raw_train['reviews'].iloc[0]))


def preprocess_function(example):
    """Tokenize the 'reviews' field of a batch of examples.

    Called by `Dataset.map(..., batched=True)`, so `example["reviews"]` is a
    list of strings. Truncation is enabled; no padding is applied here because
    `DataCollatorWithPadding` pads each batch when it is collated.

    Returns:
        The tokenizer output for the batch; its fields are added as new
        columns to the dataset by `map`.
    """
    return tokenizer(example["reviews"], truncation=True)


tokenized_datasets = review_dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



['조아', '##요', '처음', '##구', '##입', '싸', '##게', '##햇', '##어요']


  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

In [10]:
# Display the tokenized splits to confirm which columns the tokenizer added.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['reviews', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 24975
    })
    valid: Dataset({
        features: ['reviews', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25
    })
    test: Dataset({
        features: ['id', 'reviews', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [11]:
# Labeled splits: drop the raw text and expose the rating under the column
# name "labels", which is the keyword the model call receives it under.
for split_name in ("train", "valid"):
    tokenized_datasets[split_name] = (
        tokenized_datasets[split_name]
        .remove_columns(["reviews"])
        .rename_column("target", "labels")
    )

# Unlabeled test split: drop everything that is not a tokenizer output.
tokenized_datasets['test'] = tokenized_datasets['test'].remove_columns(["id", "reviews"])

# Return torch tensors when rows are accessed.
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [12]:
from torch.utils.data import DataLoader
# Only the training loader shuffles. Validation and test are read in dataset
# order so that evaluation is deterministic and test predictions stay aligned
# with the rows of the submission file.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
valid_dataloader = DataLoader(tokenized_datasets["valid"], shuffle=False, batch_size=batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["test"], shuffle=False, batch_size=batch_size, collate_fn=data_collator)

In [13]:
# Pull a single batch and show the shape of every tensor the collator produced.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([200]),
 'input_ids': torch.Size([200, 73]),
 'token_type_ids': torch.Size([200, 73]),
 'attention_mask': torch.Size([200, 73])}

## 3. 모델 로드

In [14]:
# The raw 'target' ratings are passed to the model as class indices without
# any remapping, so the classification head needs one output per index up to
# the largest rating (presumably 5 - confirm against the full label set).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=6) # set to 6 for convenience

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [15]:
from transformers import get_scheduler

# transformers.AdamW is deprecated and has been removed from recent
# transformers releases, so use the PyTorch implementation. eps and
# weight_decay are set explicitly to the defaults of the old transformers
# optimizer (torch's own defaults are 1e-8 and 0.01) so training is unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# One scheduler step is taken per optimizer step, so the schedule length is
# the number of batches per epoch times the number of epochs.
num_training_steps = NUM_EPOCH * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

625




In [16]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model's parameters onto the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

## 4. 모델 학습

In [17]:
from tqdm.auto import tqdm
# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Fine-tune the model and write a checkpoint after every epoch.
model.train()
for epoch in range(NUM_EPOCH):
    for batch in train_dataloader:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes 'labels', so the model output carries a loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear the gradients
        # so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Checkpoint directory is named after the 1-based epoch number; the
    # tokenizer is saved alongside the model weights.
    model.save_pretrained(f"./aug_result/{MODEL_NAME}/{epoch+1}")
    tokenizer.save_pretrained(f"./aug_result/{MODEL_NAME}/{epoch+1}")

  0%|          | 0/625 [00:00<?, ?it/s]

## 5. 모델 검증

In [18]:
def compute_accuracy(model, dataloader, device):
    """Return the fraction of examples in `dataloader` that `model` classifies correctly.

    Args:
        model: callable taking the batch tensors as keyword arguments and
            returning an object with a `logits` attribute of shape
            (batch, num_classes). It is switched to eval mode before use.
        dataloader: iterable of dicts of tensors; each dict must contain a
            'labels' tensor of class indices.
        device: torch device the batch tensors are moved to before the forward pass.

    Returns:
        Accuracy as a Python float in [0, 1], or NaN when the dataloader
        yields no examples.
    """
    model.eval()
    num_correct = 0
    num_seen = 0
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        predictions = torch.argmax(outputs.logits, dim=-1)
        targets = batch['labels']

        # Accumulate plain counts rather than per-example tensors; this keeps
        # memory flat and avoids depending on a metrics-library API.
        num_correct += (predictions == targets).sum().item()
        num_seen += targets.numel()

    if num_seen == 0:
        return float("nan")
    return num_correct / num_seen


def validation_model(model):
    """Print the accuracy of `model` on the validation DataLoader."""
    print(f'valid acc: {compute_accuracy(model, valid_dataloader, device):.4f}')


def validation_train_model(model):
    """Print the accuracy of `model` on the training DataLoader."""
    print(f'train acc: {compute_accuracy(model, train_dataloader, device):.4f}')

In [19]:
# Evaluate every saved checkpoint on the validation split.
# Checkpoint directories are named after their epoch number. Sort them
# numerically (a plain string sort puts "10" before "2") and take the label
# from the directory name, so the printed number always matches the
# checkpoint even with 10+ epochs or leftover non-checkpoint entries.
save_paths = sorted(
    (p for p in glob(f"./aug_result/{MODEL_NAME}/*") if os.path.basename(p).isdigit()),
    key=lambda p: int(os.path.basename(p)),
)
for save_path in save_paths:
    ckpt_epoch = int(os.path.basename(save_path))
    model = AutoModelForSequenceClassification.from_pretrained(save_path, num_labels=6).to(device)
    print(f"{ckpt_epoch}:model >> {save_path}")
    validation_model(model)

1:model >> ./aug_result/klue/bert-base/1
valid acc: 0.7600
2:model >> ./aug_result/klue/bert-base/2
valid acc: 0.7600
3:model >> ./aug_result/klue/bert-base/3
valid acc: 0.7600
4:model >> ./aug_result/klue/bert-base/4
valid acc: 0.7600
5:model >> ./aug_result/klue/bert-base/5
valid acc: 0.7600


In [20]:
# Evaluate every saved checkpoint on the training split.
# Checkpoint directories are named after their epoch number. Sort them
# numerically (a plain string sort puts "10" before "2") and take the label
# from the directory name, so the printed number always matches the
# checkpoint even with 10+ epochs or leftover non-checkpoint entries.
save_paths = sorted(
    (p for p in glob(f"./aug_result/{MODEL_NAME}/*") if os.path.basename(p).isdigit()),
    key=lambda p: int(os.path.basename(p)),
)
for save_path in save_paths:
    ckpt_epoch = int(os.path.basename(save_path))
    model = AutoModelForSequenceClassification.from_pretrained(save_path, num_labels=6).to(device)
    print(f"{ckpt_epoch}:model >> {save_path}")
    validation_train_model(model)

1:model >> ./aug_result/klue/bert-base/1
train acc: 0.7081
2:model >> ./aug_result/klue/bert-base/2
train acc: 0.7240
3:model >> ./aug_result/klue/bert-base/3
train acc: 0.7412
4:model >> ./aug_result/klue/bert-base/4
train acc: 0.7493
5:model >> ./aug_result/klue/bert-base/5
train acc: 0.7563


## 6. 모델 Evaluation 및 Submission 파일 생성

In [21]:
def evaluate_submit_model(model, eval_epoch):
    """Predict the test split with `model` and write a submission CSV.

    Args:
        model: sequence-classification model already placed on `device`.
        eval_epoch: label appended to the output file name (the epoch the
            checkpoint was saved at).

    Side effects:
        Reads dataset/sample_submission.csv, overwrites its "target" column
        with the predicted class indices and writes the result to
        ./submission/submission_<model name>_<eval_epoch>.csv.
    """
    prediction_list = []
    model.eval()
    # eval_dataloader is built with shuffle=False, so predictions come out in
    # the same order as the rows of the test CSV.
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        predictions = torch.argmax(outputs.logits, dim=-1)
        prediction_list.extend(predictions.cpu().tolist())

    submission = pd.read_csv("dataset/sample_submission.csv")
    submission["target"] = prediction_list

    # to_csv does not create missing directories; without this the whole
    # inference pass is lost to an OSError on a fresh checkout.
    os.makedirs("./submission", exist_ok=True)
    submission.to_csv(f"./submission/submission_{MODEL_NAME.replace('/', '_')}_{eval_epoch}.csv", index=False)

In [22]:
# Write one submission file per saved checkpoint.
# Checkpoint directories are named after their epoch number. Sort them
# numerically (a plain string sort puts "10" before "2") and take the epoch
# from the directory name, so each submission file is labeled with the
# checkpoint that actually produced it.
save_paths = sorted(
    (p for p in glob(f"./aug_result/{MODEL_NAME}/*") if os.path.basename(p).isdigit()),
    key=lambda p: int(os.path.basename(p)),
)
for save_path in save_paths:
    ckpt_epoch = int(os.path.basename(save_path))
    model = AutoModelForSequenceClassification.from_pretrained(save_path, num_labels=6).to(device)
    print(f"{ckpt_epoch}:model >> {save_path}")
    evaluate_submit_model(model, ckpt_epoch)

1:model >> ./aug_result/klue/bert-base/1
2:model >> ./aug_result/klue/bert-base/2
3:model >> ./aug_result/klue/bert-base/3
4:model >> ./aug_result/klue/bert-base/4
5:model >> ./aug_result/klue/bert-base/5
