In [1]:

import os
# Make CUDA kernel launches synchronous so device-side errors surface at the failing call
# (a debugging aid; it slows execution).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Expose only GPU index 2 to this process. Both variables are set before `torch` is
# imported further down, so they are in place before CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# NOTE(review): this module binding is replaced by the later `from tqdm import tqdm, tqdm_notebook`.
import tqdm

import numpy as np
import pandas as pd
import torch
import datasets


from glob import glob
from tqdm import tqdm, tqdm_notebook

from sklearn.model_selection import train_test_split
from torch.utils.data.dataset import random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Hugging Face model id: used to load the tokenizer, as a path component of the
# fine-tuned checkpoint directory, and (with "/" replaced) in the submission file name.
MODEL_NAME = "kykim/electra-kor-base" #"klue/roberta-large"
# The checkpoint is read from sub-directory NUM_EPOCH-1, while NUM_EPOCH itself is
# embedded in the submission file name.
NUM_EPOCH = 2

In [3]:
from datasets import load_dataset

# Read the augmented test CSV. With a bare path as `data_files`, all rows end up in a
# single split named "train", which is why later cells index raw_test['train'].
raw_test = load_dataset(
    path='csv',
    data_files='./dataset/aug_test_en_ko_jp.csv',
)
raw_test

Using custom data configuration default-6e0c7d1ef0e700df


Downloading and preparing dataset csv/default to /home/uj-user/.cache/huggingface/datasets/csv/default-6e0c7d1ef0e700df/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/uj-user/.cache/huggingface/datasets/csv/default-6e0c7d1ef0e700df/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'reviews', 'en_ko_review', 'jp_ko_review', 'jp_review'],
        num_rows: 25000
    })
})

In [4]:
# Re-label the lone "train" split as "test" so downstream names match what the data is.
review_dataset = datasets.DatasetDict()
review_dataset['test'] = raw_test['train']
review_dataset

DatasetDict({
    test: Dataset({
        features: ['id', 'reviews', 'en_ko_review', 'jp_ko_review', 'jp_review'],
        num_rows: 25000
    })
})

In [5]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sanity check: show how the first review is split into subword tokens.
# NOTE(review): this previews the `reviews` column, whereas the text actually fed to
# the model below comes from `jp_ko_review`.
print(tokenizer.tokenize(raw_test['train'][0]['reviews']))


def tokenize_function(example):
    """Tokenize the `jp_ko_review` text of a batch of rows.

    Called by `Dataset.map(..., batched=True)`, so `example` maps column names to
    lists of values. Sequences are truncated but not padded here; padding is left
    to `data_collator` at batching time.
    """
    return tokenizer(text=example["jp_ko_review"], truncation=True)


# Add the tokenizer outputs as new columns next to the original ones.
tokenized_datasets = review_dataset.map(tokenize_function, batched=True)
# Collator that pads each batch using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



['채소', '##가', '약간', '시', '##들어', '있어요']


  0%|          | 0/25 [00:00<?, ?ba/s]

In [7]:
# Inspect the dataset after tokenization: tokenizer outputs sit alongside the original columns.
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['id', 'reviews', 'en_ko_review', 'jp_ko_review', 'jp_review', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [8]:
# Raw id/text columns that are not model inputs and must be dropped before batching.
non_model_columns = ["id", "reviews", "en_ko_review", "jp_ko_review","jp_review"]

# Drop only the columns that are still present. `tokenized_datasets` is rebound here,
# so without this filter a second run of the cell would fail because the columns
# were already removed by the first run.
columns_to_drop = [
    name for name in non_model_columns
    if name in tokenized_datasets["test"].column_names
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

# Have the dataset return torch tensors when indexed.
tokenized_datasets.set_format("torch")

In [9]:
# Confirm that only the tokenizer-produced columns remain after the column drop.
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [10]:
from torch.utils.data import DataLoader

# Keep the original row order (shuffle=False): predictions are later written into the
# submission frame positionally, so batch order must match row order.
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["test"],
    shuffle=False,
    batch_size=64,
    collate_fn=data_collator,
)

In [12]:
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is visible to this process, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Directory of the fine-tuned checkpoint; the last path component is NUM_EPOCH-1
# (presumably a zero-based epoch index -- confirm against the training notebook).
model_path = f'/home/uj-user/yohan/review/aug_result/{MODEL_NAME}/{NUM_EPOCH-1}'
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=6,
)
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(42000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [13]:
# Switch to evaluation mode before running inference.
model.eval()
prediction_list = []

# Gradients are never needed for inference, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in eval_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        # Predicted class = index of the largest logit for each row of the batch.
        prediction_list += logits.argmax(dim=-1).cpu().tolist()

In [14]:

# Load the submission template and fill in the predicted labels
# (prediction order follows the un-shuffled evaluation dataloader).
submission = pd.read_csv("dataset/sample_submission.csv").assign(target=prediction_list)
submission.head()

Unnamed: 0,id,target
0,0,2
1,1,1
2,2,5
3,3,1
4,4,1


In [16]:
# Output file name: the model id with "/" replaced by "_" (so it stays a single path
# component), NUM_EPOCH, and a "jp_ko" suffix (presumably naming the input column
# used for this run -- confirm).
submission_path = f"./submission/submission_{MODEL_NAME.replace('/', '_')}_{NUM_EPOCH}_jp_ko.csv"

# `to_csv` does not create missing parent directories; make sure the folder exists so
# the finished inference run is not lost to an OSError on a fresh checkout.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

submission.to_csv(submission_path, index=False)