In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the Daum movie reviews and turn the rating column into a binary target:
# a rating below 6 maps to label 0, anything else maps to label 1.
df = pd.read_csv('/content/daum_movie_review.csv')
y = [int(not (score < 6)) for score in df['rating']]

# First carve off the test split, then split the remainder into train / validation.
# No test_size is passed, so scikit-learn's default proportion applies both times;
# random_state=0 keeps both splits repeatable.
reviews = df['review'].tolist()
X_train_val, X_test, y_train_val, y_test = train_test_split(reviews, y, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=0)

# Report how many reviews ended up in each split.
for caption, split in (
    ('#Train set size:', X_train),
    ('#Validation set size:', X_val),
    ('#Test set size:', X_test),
):
    print(caption, len(split))

#Train set size: 8282
#Validation set size: 2761
#Test set size: 3682


In [5]:
import torch
from datasets import load_metric

# Accuracy metric object used by `compute_metrics` below.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
  """Score one evaluation pass by the accuracy of the argmax predictions.

  Args:
    eval_pred: a two-element `(logits, labels)` pair. The predicted class is
      taken as the argmax over the last axis of `logits`.

  Returns:
    The value returned by `metric.compute` for these predictions and labels.
  """
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  # The metric API names the ground-truth argument `references` (plural);
  # the singular spelling is not accepted and made every evaluation fail.
  return metric.compute(predictions=predictions, references=labels)

class OurDataset(torch.utils.data.Dataset):
  """Map-style dataset that pairs encoded inputs with one label per example.

  Args:
    inputs: mapping from field name to an indexable collection (tensor or
      nested list) whose first dimension runs over examples.
    labels: indexable collection with one label per example.

  Indexing returns a dict holding every input field for that example plus a
  `'labels'` entry. Each value is a tensor that is a copy of the stored data.
  """

  def __init__(self, inputs, labels):
    self.inputs = inputs
    self.labels = labels

  @staticmethod
  def _to_tensor(value):
    """Return `value` as a tensor copy without triggering the copy-construct warning."""
    if isinstance(value, torch.Tensor):
      # torch.tensor(tensor) warns on every call; clone().detach() is the
      # documented way to copy an existing tensor.
      return value.clone().detach()
    return torch.tensor(value)

  def __getitem__(self, idx):
    item = {key: self._to_tensor(val[idx]) for key, val in self.inputs.items()}
    item['labels'] = self._to_tensor(self.labels[idx])
    return item

  def __len__(self):
    # The number of examples is defined by the labels.
    return len(self.labels)


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [7]:
from transformers import BertTokenizer

# Load the tokenizer of the multilingual cased BERT checkpoint and show what it
# produces for a short Korean sample: first the token strings, then the full encoding.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

sample_text = "안녕하세요. 반갑습니다."
print(tokenizer.tokenize(sample_text))

inputs = tokenizer(sample_text)
print(inputs)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

['안', '##녕', '##하', '##세', '##요', '.', '반', '##갑', '##습', '##니다', '.']
{'input_ids': [101, 9521, 118741, 35506, 24982, 48549, 119, 9321, 118610, 119081, 48345, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
from transformers import BertForSequenceClassification 
from transformers import Trainer, TrainingArguments

# Encode the three text splits with identical tokenizer settings
# (truncation and padding enabled, "pt" tensors requested).
train_input, val_input, test_input = (
    tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
    for texts in (X_train, X_val, X_test)
)

# Pair every encoding with the labels of the same split.
train_dataset, val_dataset, test_dataset = (
    OurDataset(encoding, targets)
    for encoding, targets in (
        (train_input, y_train),
        (val_input, y_val),
        (test_input, y_test),
    )
)

# Sequence-classification model initialised from the multilingual cased BERT checkpoint.
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased")

# Training configuration: two epochs, with an evaluation every 500 steps.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=500,
    warmup_steps=200,
    weight_decay=0.01,
)

# Train on the training split; the validation split is scored with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Step,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
# Write the Trainer's model out under the name "my_model".
trainer.save_model("my_model")

In [None]:
# Run the Trainer's evaluation on the test dataset instead of the validation one.
# This is the cell's last expression, so the returned value is what the notebook shows.
trainer.evaluate(eval_dataset=test_dataset)

In [None]:
!pip install sentencepiece
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
# Drop the notebook's references to the first model and its Trainer, then ask
# PyTorch to release the CUDA memory it has cached, before the next model is loaded.
del model, trainer
torch.cuda.empty_cache()

In [None]:
from kobert_tokenizer import KoBERTTokenizer
# Replace the tokenizer with the KoBERT one and run it on a short Korean sample:
# first the token strings, then the full encoding.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

sample_text = "안녕하세요. 반갑습니다."
print(tokenizer.tokenize(sample_text))

inputs = tokenizer(sample_text)
print(inputs)

In [None]:
from transformers import BertModel
from torch.utils.data import DataLoader

# Re-encode the three text splits with the current tokenizer, using the same
# settings for each (truncation and padding enabled, "pt" tensors requested).
train_input, val_input, test_input = (
    tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
    for texts in (X_train, X_val, X_test)
)

# Pair every encoding with the labels of the same split.
train_dataset, val_dataset, test_dataset = (
    OurDataset(encoding, targets)
    for encoding, targets in (
        (train_input, y_train),
        (val_input, y_val),
        (test_input, y_test),
    )
)

# Only the training loader shuffles; validation and test use a batch size of 16.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Pretrained KoBERT encoder that the custom classifier is built on.
bert_model = BertModel.from_pretrained('skt/kobert-base-v1')

class MyModel(torch.nn.Module):
    """Pretrained encoder followed by a single linear classification layer.

    Args:
        pretrained_model: module called as `pretrained_model(**inputs)` whose
            result exposes a `last_hidden_state` tensor.
        token_size: width of the vector fed into the linear layer.
        num_labels: number of output scores per example.
    """

    def __init__(self, pretrained_model, token_size, num_labels):
        super().__init__()
        self.token_size, self.num_labels = token_size, num_labels
        self.pretrained_model = pretrained_model

        # Linear head mapping one encoder vector to one score per label.
        self.classifier = torch.nn.Linear(token_size, num_labels)

    def forward(self, inputs):
        """Return classifier scores computed from position 0 of the encoder output.

        `inputs` is a mapping that is unpacked as keyword arguments to the encoder.
        """
        hidden_states = self.pretrained_model(**inputs).last_hidden_state
        # Keep only the vector at sequence position 0 of every example.
        first_token = hidden_states[:, 0]
        return self.classifier(first_token)

# Wrap the loaded encoder with a 2-label linear head. The head's input width is
# read from the encoder's own config rather than hard-coded.
model = MyModel(bert_model, num_labels=2, token_size=bert_model.config.hidden_size) 

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F
import time

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

optim = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
criterion = torch.nn.CrossEntropyLoss()

# The scheduler needs the total number of optimizer steps over all epochs.
num_epochs = 2
total_training_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer=optim,
                                            num_training_steps=total_training_steps,
                                            num_warmup_steps=200)


def _mean_validation_loss(net, loader, loss_fn, target_device):
    """Return the mean per-batch loss of `net` over `loader` as a Python float.

    Switches `net` to eval mode and runs without gradient tracking; the caller
    is responsible for switching back to train mode afterwards.
    """
    net.eval()
    running_loss = 0.0
    with torch.no_grad():
        for val_batch in loader:
            val_inputs = {k: v.to(target_device) for k, v in val_batch.items() if k != 'labels'}
            val_labels = val_batch['labels'].to(target_device)
            running_loss += loss_fn(net(val_inputs), val_labels).item()
    return running_loss / len(loader)


start = time.time()
train_loss = 0.0   # sum of training losses since the last report
eval_steps = 500   # report and validate every this many optimizer steps
step = 0

for epoch in range(num_epochs):
    for batch in train_loader:
        # Validation leaves the model in eval mode, so re-enter train mode each step.
        model.train()
        optim.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(inputs)
        # CrossEntropyLoss accepts integer class indices directly; for hard labels
        # this is the same value as passing the one-hot float encoding.
        loss = criterion(outputs, labels)

        loss.backward()
        optim.step()
        scheduler.step()

        # .item() stores a plain float; accumulating the tensor itself would keep
        # every step's autograd graph alive until the next report.
        train_loss += loss.item()

        step += 1
        if step % eval_steps == 0:
            avg_val_loss = _mean_validation_loss(model, val_loader, criterion, device)
            avg_train_loss = train_loss / eval_steps
            elapsed = time.time() - start
            print('Step %d, elapsed time: %.2f, train loss: %.4f, validation loss: %.4f'
                  % (step, elapsed, avg_train_loss, avg_val_loss))
            train_loss = 0.0

In [None]:
from datasets import load_metric

# Start from a freshly loaded accuracy metric and put the model in eval mode.
metric = load_metric("accuracy")
model.eval()

# Feed the test loader through the model batch by batch, handing the argmax
# class of each row and the true label to the metric as we go.
with torch.no_grad():
    for test_batch in test_loader:
        features = {
            name: tensor.to(device)
            for name, tensor in test_batch.items()
            if name != 'labels'
        }
        targets = test_batch['labels'].to(device)

        logits = model(features)
        predicted = logits.argmax(dim=-1)
        metric.add_batch(predictions=predicted, references=targets)

# Last expression of the cell: the metric over everything added above.
metric.compute()